diff --git "a/trainer_state.json" "b/trainer_state.json" deleted file mode 100644--- "a/trainer_state.json" +++ /dev/null @@ -1,248869 +0,0 @@ -{ - "best_metric": 0.024749755859375, - "best_model_checkpoint": "./results_morgan/checkpoint-4000000", - "epoch": 0.0012, - "eval_steps": 20000, - "global_step": 4120000, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.0, - "learning_rate": 4.932819633999246e-05, - "loss": 2.5863, - "step": 100 - }, - { - "epoch": 0.0, - "learning_rate": 5.719504324825564e-05, - "loss": 1.4847, - "step": 200 - }, - { - "epoch": 0.0, - "learning_rate": 6.170868326030393e-05, - "loss": 1.2923, - "step": 300 - }, - { - "epoch": 0.0, - "learning_rate": 6.488740554563935e-05, - "loss": 1.186, - "step": 400 - }, - { - "epoch": 0.0, - "learning_rate": 6.734317372309117e-05, - "loss": 1.1051, - "step": 500 - }, - { - "epoch": 0.0, - "learning_rate": 6.934466112452983e-05, - "loss": 1.0482, - "step": 600 - }, - { - "epoch": 0.0, - "learning_rate": 7.103398676137137e-05, - "loss": 0.9834, - "step": 700 - }, - { - "epoch": 0.0, - "learning_rate": 7.24955125606774e-05, - "loss": 0.9314, - "step": 800 - }, - { - "epoch": 0.0, - "learning_rate": 7.378343796989793e-05, - "loss": 0.8974, - "step": 900 - }, - { - "epoch": 0.0, - "learning_rate": 7.493465960993282e-05, - "loss": 0.8582, - "step": 1000 - }, - { - "epoch": 0.0, - "learning_rate": 7.59754330499353e-05, - "loss": 0.8014, - "step": 1100 - }, - { - "epoch": 0.0, - "learning_rate": 7.692510816983375e-05, - "loss": 0.8006, - "step": 1200 - }, - { - "epoch": 0.0, - "learning_rate": 7.779835690831703e-05, - "loss": 0.7641, - "step": 1300 - }, - { - "epoch": 0.0, - "learning_rate": 7.860656934404977e-05, - "loss": 0.74, - "step": 1400 - }, - { - "epoch": 0.0, - "learning_rate": 7.93587649369845e-05, - "loss": 0.7245, - "step": 1500 - }, - { - "epoch": 0.0, - "learning_rate": 8.006220792650233e-05, - "loss": 0.7122, - "step": 1600 - }, - { - "epoch": 0.0, - "learning_rate": 8.07228351498672e-05, - "loss": 0.6768, - "step": 1700 - }, - { - "epoch": 0.0, - "learning_rate": 8.134556096770183e-05, - "loss": 0.656, - "step": 1800 - }, - { - "epoch": 0.0, - "learning_rate": 8.193449936668137e-05, - "loss": 0.6605, - "step": 1900 - }, - { - "epoch": 0.0, - "learning_rate": 8.249312884939091e-05, - "loss": 0.6269, - "step": 2000 - }, - { - "epoch": 0.0, - "learning_rate": 8.302441693357058e-05, - "loss": 0.6224, - "step": 2100 - }, - { - "epoch": 0.0, - "learning_rate": 8.353091558096729e-05, - "loss": 0.6004, - "step": 2200 - }, - { - "epoch": 0.0, - "learning_rate": 8.40148353391312e-05, - "loss": 0.579, - "step": 2300 - }, - { - "epoch": 0.0, - "learning_rate": 8.447810365175978e-05, - "loss": 0.5846, - "step": 2400 - }, - { - "epoch": 0.0, - "learning_rate": 8.49224112285631e-05, - "loss": 0.5704, - "step": 2500 - }, - { - "epoch": 0.0, - "learning_rate": 8.534924929370153e-05, - "loss": 0.5632, - "step": 2600 - }, - { - "epoch": 0.0, - "learning_rate": 8.575590883528774e-05, - "loss": 0.5384, - "step": 2700 - }, - { - "epoch": 0.0, - "learning_rate": 8.615177339252512e-05, - "loss": 0.5403, - "step": 2800 - }, - { - "epoch": 0.0, - "learning_rate": 8.653371084119956e-05, - "loss": 0.5251, - "step": 2900 - }, - { - "epoch": 0.0, - "learning_rate": 8.690266792100479e-05, - "loss": 0.5205, - "step": 3000 - }, - { - "epoch": 0.0, - "learning_rate": 8.725949800007947e-05, - "loss": 0.5042, - "step": 3100 - }, - { - "epoch": 0.0, - "learning_rate": 8.760497296348612e-05, - "loss": 0.5014, - "step": 3200 - }, - { - "epoch": 0.0, - "learning_rate": 8.793979326779768e-05, - "loss": 0.4824, - "step": 3300 - }, - { - "epoch": 0.0, - "learning_rate": 8.826459649112794e-05, - "loss": 0.4899, - "step": 3400 - }, - { - "epoch": 0.0, - "learning_rate": 8.857996464094115e-05, - "loss": 0.479, - "step": 3500 - }, - { - "epoch": 0.0, - "learning_rate": 8.888643043011622e-05, - "loss": 0.472, - "step": 3600 - }, - { - "epoch": 0.0, - "learning_rate": 8.918448269127446e-05, - "loss": 0.4624, - "step": 3700 - }, - { - "epoch": 0.0, - "learning_rate": 8.947457106756977e-05, - "loss": 0.4584, - "step": 3800 - }, - { - "epoch": 0.0, - "learning_rate": 8.975711009295404e-05, - "loss": 0.4585, - "step": 3900 - }, - { - "epoch": 0.0, - "learning_rate": 9.002704319460056e-05, - "loss": 0.4445, - "step": 4000 - }, - { - "epoch": 0.0, - "learning_rate": 9.029573698745933e-05, - "loss": 0.4458, - "step": 4100 - }, - { - "epoch": 0.0, - "learning_rate": 9.055794152084609e-05, - "loss": 0.4392, - "step": 4200 - }, - { - "epoch": 0.0, - "learning_rate": 9.081396286331679e-05, - "loss": 0.4284, - "step": 4300 - }, - { - "epoch": 0.0, - "learning_rate": 9.106408592760968e-05, - "loss": 0.4272, - "step": 4400 - }, - { - "epoch": 0.0, - "learning_rate": 9.130857637656785e-05, - "loss": 0.4085, - "step": 4500 - }, - { - "epoch": 0.0, - "learning_rate": 9.154768231915052e-05, - "loss": 0.4034, - "step": 4600 - }, - { - "epoch": 0.0, - "learning_rate": 9.178163582367895e-05, - "loss": 0.4052, - "step": 4700 - }, - { - "epoch": 0.0, - "learning_rate": 9.201065427145362e-05, - "loss": 0.4068, - "step": 4800 - }, - { - "epoch": 0.0, - "learning_rate": 9.223494157053206e-05, - "loss": 0.4046, - "step": 4900 - }, - { - "epoch": 0.0, - "learning_rate": 9.245468924665303e-05, - "loss": 0.3919, - "step": 5000 - }, - { - "epoch": 0.0, - "learning_rate": 9.267007742593345e-05, - "loss": 0.3837, - "step": 5100 - }, - { - "epoch": 0.0, - "learning_rate": 9.288127572197122e-05, - "loss": 0.3848, - "step": 5200 - }, - { - "epoch": 0.0, - "learning_rate": 9.308844403830141e-05, - "loss": 0.3861, - "step": 5300 - }, - { - "epoch": 0.0, - "learning_rate": 9.329173329571588e-05, - "loss": 0.3732, - "step": 5400 - }, - { - "epoch": 0.0, - "learning_rate": 9.348930861125227e-05, - "loss": 0.3717, - "step": 5500 - }, - { - "epoch": 0.0, - "learning_rate": 9.368529519716057e-05, - "loss": 0.3737, - "step": 5600 - }, - { - "epoch": 0.0, - "learning_rate": 9.387780665987676e-05, - "loss": 0.3573, - "step": 5700 - }, - { - "epoch": 0.0, - "learning_rate": 9.40669640931859e-05, - "loss": 0.3696, - "step": 5800 - }, - { - "epoch": 0.0, - "learning_rate": 9.425288236967753e-05, - "loss": 0.3638, - "step": 5900 - }, - { - "epoch": 0.0, - "learning_rate": 9.443567055973278e-05, - "loss": 0.3572, - "step": 6000 - }, - { - "epoch": 0.0, - "learning_rate": 9.461543231582187e-05, - "loss": 0.3521, - "step": 6100 - }, - { - "epoch": 0.0, - "learning_rate": 9.479226622550294e-05, - "loss": 0.3458, - "step": 6200 - }, - { - "epoch": 0.0, - "learning_rate": 9.496626613613171e-05, - "loss": 0.3307, - "step": 6300 - }, - { - "epoch": 0.0, - "learning_rate": 9.513752145396e-05, - "loss": 0.3307, - "step": 6400 - }, - { - "epoch": 0.0, - "learning_rate": 9.530611742000922e-05, - "loss": 0.3446, - "step": 6500 - }, - { - "epoch": 0.0, - "learning_rate": 9.547213536485023e-05, - "loss": 0.3377, - "step": 6600 - }, - { - "epoch": 0.0, - "learning_rate": 9.563565294419558e-05, - "loss": 0.3245, - "step": 6700 - }, - { - "epoch": 0.0, - "learning_rate": 9.579674435701253e-05, - "loss": 0.3279, - "step": 6800 - }, - { - "epoch": 0.0, - "learning_rate": 9.595548054769064e-05, - "loss": 0.3227, - "step": 6900 - }, - { - "epoch": 0.0, - "learning_rate": 9.611192939364202e-05, - "loss": 0.3319, - "step": 7000 - }, - { - "epoch": 0.0, - "learning_rate": 9.626615587957666e-05, - "loss": 0.3167, - "step": 7100 - }, - { - "epoch": 0.0, - "learning_rate": 9.641822225957206e-05, - "loss": 0.3196, - "step": 7200 - }, - { - "epoch": 0.0, - "learning_rate": 9.656818820794935e-05, - "loss": 0.313, - "step": 7300 - }, - { - "epoch": 0.0, - "learning_rate": 9.671611095987065e-05, - "loss": 0.3142, - "step": 7400 - }, - { - "epoch": 0.0, - "learning_rate": 9.685769582820096e-05, - "loss": 0.3081, - "step": 7500 - }, - { - "epoch": 0.0, - "learning_rate": 9.700175210160166e-05, - "loss": 0.3183, - "step": 7600 - }, - { - "epoch": 0.0, - "learning_rate": 9.714392202673169e-05, - "loss": 0.3224, - "step": 7700 - }, - { - "epoch": 0.0, - "learning_rate": 9.72842543674037e-05, - "loss": 0.3041, - "step": 7800 - }, - { - "epoch": 0.0, - "learning_rate": 9.742279602065062e-05, - "loss": 0.2954, - "step": 7900 - }, - { - "epoch": 0.0, - "learning_rate": 9.755959211081178e-05, - "loss": 0.3045, - "step": 8000 - }, - { - "epoch": 0.0, - "learning_rate": 9.769468607776539e-05, - "loss": 0.2967, - "step": 8100 - }, - { - "epoch": 0.0, - "learning_rate": 9.782811975973896e-05, - "loss": 0.3057, - "step": 8200 - }, - { - "epoch": 0.0, - "learning_rate": 9.795993347109249e-05, - "loss": 0.2927, - "step": 8300 - }, - { - "epoch": 0.0, - "learning_rate": 9.809016607543647e-05, - "loss": 0.3007, - "step": 8400 - }, - { - "epoch": 0.0, - "learning_rate": 9.821885505441631e-05, - "loss": 0.2893, - "step": 8500 - }, - { - "epoch": 0.0, - "learning_rate": 9.834477210358549e-05, - "loss": 0.2889, - "step": 8600 - }, - { - "epoch": 0.0, - "learning_rate": 9.847049562565526e-05, - "loss": 0.2926, - "step": 8700 - }, - { - "epoch": 0.0, - "learning_rate": 9.859477997323254e-05, - "loss": 0.2776, - "step": 8800 - }, - { - "epoch": 0.0, - "learning_rate": 9.871765772262129e-05, - "loss": 0.2688, - "step": 8900 - }, - { - "epoch": 0.0, - "learning_rate": 9.883795203612481e-05, - "loss": 0.2951, - "step": 9000 - }, - { - "epoch": 0.0, - "learning_rate": 9.895812329110132e-05, - "loss": 0.2865, - "step": 9100 - }, - { - "epoch": 0.0, - "learning_rate": 9.907697901604568e-05, - "loss": 0.2817, - "step": 9200 - }, - { - "epoch": 0.0, - "learning_rate": 9.919454770189497e-05, - "loss": 0.2769, - "step": 9300 - }, - { - "epoch": 0.0, - "learning_rate": 9.931085692393412e-05, - "loss": 0.2857, - "step": 9400 - }, - { - "epoch": 0.0, - "learning_rate": 9.942593338061702e-05, - "loss": 0.2677, - "step": 9500 - }, - { - "epoch": 0.0, - "learning_rate": 9.953980293035202e-05, - "loss": 0.2693, - "step": 9600 - }, - { - "epoch": 0.0, - "learning_rate": 9.965249062637823e-05, - "loss": 0.2803, - "step": 9700 - }, - { - "epoch": 0.0, - "learning_rate": 9.976402074985049e-05, - "loss": 0.2663, - "step": 9800 - }, - { - "epoch": 0.0, - "learning_rate": 9.987441684124227e-05, - "loss": 0.2725, - "step": 9900 - }, - { - "epoch": 0.0, - "learning_rate": 9.998370173016803e-05, - "loss": 0.2639, - "step": 10000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.2637, - "step": 10100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.2627, - "step": 10200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.2681, - "step": 10300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.2696, - "step": 10400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.263, - "step": 10500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.2637, - "step": 10600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.2607, - "step": 10700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.2502, - "step": 10800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.252, - "step": 10900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.2483, - "step": 11000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.2449, - "step": 11100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.2397, - "step": 11200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.2481, - "step": 11300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.2459, - "step": 11400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.2437, - "step": 11500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.2456, - "step": 11600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.2518, - "step": 11700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.2371, - "step": 11800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.2379, - "step": 11900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.2374, - "step": 12000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.2409, - "step": 12100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.2401, - "step": 12200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.2376, - "step": 12300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.2414, - "step": 12400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.2348, - "step": 12500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.2314, - "step": 12600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.2223, - "step": 12700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.2256, - "step": 12800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.2287, - "step": 12900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.2217, - "step": 13000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.2317, - "step": 13100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.2208, - "step": 13200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.2282, - "step": 13300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.2182, - "step": 13400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.2171, - "step": 13500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.2223, - "step": 13600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.217, - "step": 13700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.2124, - "step": 13800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.2205, - "step": 13900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.2123, - "step": 14000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.2186, - "step": 14100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.2087, - "step": 14200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.2165, - "step": 14300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.2119, - "step": 14400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.2051, - "step": 14500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.2081, - "step": 14600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1978, - "step": 14700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.211, - "step": 14800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.2126, - "step": 14900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1937, - "step": 15000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.2059, - "step": 15100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.2036, - "step": 15200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.2041, - "step": 15300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.2034, - "step": 15400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.2027, - "step": 15500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.2017, - "step": 15600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.2008, - "step": 15700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1966, - "step": 15800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.2037, - "step": 15900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.2068, - "step": 16000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1993, - "step": 16100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.201, - "step": 16200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.2006, - "step": 16300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1987, - "step": 16400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1846, - "step": 16500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1887, - "step": 16600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1891, - "step": 16700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1941, - "step": 16800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1862, - "step": 16900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.192, - "step": 17000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1948, - "step": 17100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1971, - "step": 17200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1902, - "step": 17300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1848, - "step": 17400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1934, - "step": 17500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1867, - "step": 17600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1817, - "step": 17700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1854, - "step": 17800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1892, - "step": 17900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1913, - "step": 18000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1831, - "step": 18100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1882, - "step": 18200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1875, - "step": 18300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1892, - "step": 18400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1781, - "step": 18500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1839, - "step": 18600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1782, - "step": 18700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1799, - "step": 18800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1808, - "step": 18900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1859, - "step": 19000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.186, - "step": 19100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1852, - "step": 19200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1773, - "step": 19300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1695, - "step": 19400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1787, - "step": 19500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1699, - "step": 19600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1724, - "step": 19700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1739, - "step": 19800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1792, - "step": 19900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1771, - "step": 20000 - }, - { - "epoch": 0.0, - "eval_loss": 0.1444091796875, - "eval_runtime": 3165.6765, - "eval_samples_per_second": 355.287, - "eval_steps_per_second": 22.206, - "step": 20000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1747, - "step": 20100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1747, - "step": 20200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1726, - "step": 20300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1649, - "step": 20400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1669, - "step": 20500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1687, - "step": 20600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1764, - "step": 20700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1673, - "step": 20800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1702, - "step": 20900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1619, - "step": 21000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1684, - "step": 21100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.171, - "step": 21200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1681, - "step": 21300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1692, - "step": 21400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1702, - "step": 21500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1589, - "step": 21600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1663, - "step": 21700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1596, - "step": 21800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1571, - "step": 21900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1688, - "step": 22000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1648, - "step": 22100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1635, - "step": 22200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1634, - "step": 22300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.165, - "step": 22400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.165, - "step": 22500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1643, - "step": 22600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1677, - "step": 22700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1597, - "step": 22800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1576, - "step": 22900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1637, - "step": 23000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1579, - "step": 23100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1627, - "step": 23200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1542, - "step": 23300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1561, - "step": 23400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1592, - "step": 23500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1492, - "step": 23600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1515, - "step": 23700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1545, - "step": 23800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1608, - "step": 23900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1602, - "step": 24000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1487, - "step": 24100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1565, - "step": 24200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.15, - "step": 24300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.162, - "step": 24400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1586, - "step": 24500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1559, - "step": 24600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.167, - "step": 24700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1494, - "step": 24800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1534, - "step": 24900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1444, - "step": 25000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1579, - "step": 25100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1505, - "step": 25200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1455, - "step": 25300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1544, - "step": 25400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1466, - "step": 25500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.149, - "step": 25600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1454, - "step": 25700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1508, - "step": 25800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1475, - "step": 25900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.155, - "step": 26000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1457, - "step": 26100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1467, - "step": 26200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1421, - "step": 26300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1483, - "step": 26400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1441, - "step": 26500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1476, - "step": 26600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.151, - "step": 26700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1435, - "step": 26800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1469, - "step": 26900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1454, - "step": 27000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1371, - "step": 27100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1417, - "step": 27200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.151, - "step": 27300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1419, - "step": 27400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1366, - "step": 27500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1487, - "step": 27600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1434, - "step": 27700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1338, - "step": 27800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1492, - "step": 27900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1533, - "step": 28000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1468, - "step": 28100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1529, - "step": 28200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1463, - "step": 28300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1479, - "step": 28400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1377, - "step": 28500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1405, - "step": 28600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1449, - "step": 28700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1399, - "step": 28800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1438, - "step": 28900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.143, - "step": 29000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1349, - "step": 29100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1437, - "step": 29200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1401, - "step": 29300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1449, - "step": 29400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.131, - "step": 29500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1344, - "step": 29600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1379, - "step": 29700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1379, - "step": 29800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1343, - "step": 29900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1355, - "step": 30000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1357, - "step": 30100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1291, - "step": 30200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1401, - "step": 30300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1354, - "step": 30400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1371, - "step": 30500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1386, - "step": 30600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1348, - "step": 30700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1303, - "step": 30800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.134, - "step": 30900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.137, - "step": 31000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.133, - "step": 31100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1312, - "step": 31200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1303, - "step": 31300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1328, - "step": 31400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1362, - "step": 31500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1373, - "step": 31600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1269, - "step": 31700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1395, - "step": 31800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1364, - "step": 31900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.125, - "step": 32000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1342, - "step": 32100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1323, - "step": 32200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.129, - "step": 32300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1322, - "step": 32400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.131, - "step": 32500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.135, - "step": 32600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.124, - "step": 32700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1257, - "step": 32800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1314, - "step": 32900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1275, - "step": 33000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1301, - "step": 33100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1284, - "step": 33200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1288, - "step": 33300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1258, - "step": 33400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1275, - "step": 33500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1294, - "step": 33600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.121, - "step": 33700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1321, - "step": 33800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1225, - "step": 33900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1257, - "step": 34000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1278, - "step": 34100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1237, - "step": 34200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1244, - "step": 34300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1258, - "step": 34400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1212, - "step": 34500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1205, - "step": 34600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1255, - "step": 34700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1234, - "step": 34800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1286, - "step": 34900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1259, - "step": 35000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1211, - "step": 35100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.123, - "step": 35200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1221, - "step": 35300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1294, - "step": 35400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1186, - "step": 35500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1221, - "step": 35600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1276, - "step": 35700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1254, - "step": 35800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.123, - "step": 35900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1256, - "step": 36000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.122, - "step": 36100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1269, - "step": 36200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1195, - "step": 36300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1252, - "step": 36400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1204, - "step": 36500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1179, - "step": 36600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1239, - "step": 36700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1209, - "step": 36800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1193, - "step": 36900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1213, - "step": 37000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1192, - "step": 37100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1263, - "step": 37200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1199, - "step": 37300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.123, - "step": 37400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1162, - "step": 37500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1231, - "step": 37600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1166, - "step": 37700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1236, - "step": 37800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1215, - "step": 37900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1256, - "step": 38000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1201, - "step": 38100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1221, - "step": 38200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.118, - "step": 38300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1166, - "step": 38400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1133, - "step": 38500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1276, - "step": 38600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1208, - "step": 38700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.113, - "step": 38800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1182, - "step": 38900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1155, - "step": 39000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1215, - "step": 39100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1215, - "step": 39200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1167, - "step": 39300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1275, - "step": 39400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.114, - "step": 39500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.12, - "step": 39600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1173, - "step": 39700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1327, - "step": 39800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1156, - "step": 39900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1122, - "step": 40000 - }, - { - "epoch": 0.0, - "eval_loss": 0.09625244140625, - "eval_runtime": 3128.3661, - "eval_samples_per_second": 359.524, - "eval_steps_per_second": 22.471, - "step": 40000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1227, - "step": 40100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.116, - "step": 40200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1083, - "step": 40300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1189, - "step": 40400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1126, - "step": 40500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1162, - "step": 40600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1152, - "step": 40700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1142, - "step": 40800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1118, - "step": 40900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1138, - "step": 41000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1139, - "step": 41100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1173, - "step": 41200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1101, - "step": 41300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1185, - "step": 41400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1135, - "step": 41500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1141, - "step": 41600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1159, - "step": 41700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1084, - "step": 41800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.113, - "step": 41900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1142, - "step": 42000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1116, - "step": 42100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1209, - "step": 42200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1188, - "step": 42300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1173, - "step": 42400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1121, - "step": 42500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1077, - "step": 42600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1088, - "step": 42700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1125, - "step": 42800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1067, - "step": 42900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.11, - "step": 43000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1124, - "step": 43100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1125, - "step": 43200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1093, - "step": 43300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.115, - "step": 43400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.111, - "step": 43500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1099, - "step": 43600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1125, - "step": 43700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.112, - "step": 43800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.104, - "step": 43900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1082, - "step": 44000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1118, - "step": 44100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1046, - "step": 44200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1124, - "step": 44300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1116, - "step": 44400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1079, - "step": 44500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1048, - "step": 44600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1104, - "step": 44700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1105, - "step": 44800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1136, - "step": 44900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1049, - "step": 45000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1099, - "step": 45100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1074, - "step": 45200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1102, - "step": 45300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1092, - "step": 45400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1077, - "step": 45500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1082, - "step": 45600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1012, - "step": 45700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1106, - "step": 45800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1107, - "step": 45900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1124, - "step": 46000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1043, - "step": 46100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1054, - "step": 46200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1041, - "step": 46300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1011, - "step": 46400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1116, - "step": 46500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1089, - "step": 46600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1078, - "step": 46700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1094, - "step": 46800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1072, - "step": 46900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.105, - "step": 47000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1075, - "step": 47100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1046, - "step": 47200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1093, - "step": 47300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1102, - "step": 47400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1052, - "step": 47500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1066, - "step": 47600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1008, - "step": 47700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1058, - "step": 47800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1065, - "step": 47900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1042, - "step": 48000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1064, - "step": 48100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1027, - "step": 48200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1051, - "step": 48300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.106, - "step": 48400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.102, - "step": 48500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1063, - "step": 48600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1024, - "step": 48700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1065, - "step": 48800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1083, - "step": 48900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1072, - "step": 49000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1064, - "step": 49100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1048, - "step": 49200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1008, - "step": 49300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.106, - "step": 49400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0995, - "step": 49500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1022, - "step": 49600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1019, - "step": 49700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1029, - "step": 49800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1014, - "step": 49900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1032, - "step": 50000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0982, - "step": 50100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1027, - "step": 50200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1084, - "step": 50300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1008, - "step": 50400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1026, - "step": 50500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1018, - "step": 50600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0991, - "step": 50700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1074, - "step": 50800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1076, - "step": 50900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1056, - "step": 51000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.099, - "step": 51100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1013, - "step": 51200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0939, - "step": 51300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1038, - "step": 51400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0967, - "step": 51500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1051, - "step": 51600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1023, - "step": 51700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1034, - "step": 51800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.103, - "step": 51900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1015, - "step": 52000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1018, - "step": 52100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0962, - "step": 52200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.103, - "step": 52300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0988, - "step": 52400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1014, - "step": 52500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0948, - "step": 52600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0998, - "step": 52700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1071, - "step": 52800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1007, - "step": 52900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1041, - "step": 53000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0985, - "step": 53100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1001, - "step": 53200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1007, - "step": 53300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1011, - "step": 53400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0966, - "step": 53500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1024, - "step": 53600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.102, - "step": 53700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1035, - "step": 53800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0976, - "step": 53900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1008, - "step": 54000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0953, - "step": 54100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0936, - "step": 54200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0994, - "step": 54300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0998, - "step": 54400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.097, - "step": 54500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1021, - "step": 54600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0985, - "step": 54700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1005, - "step": 54800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1027, - "step": 54900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.097, - "step": 55000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0953, - "step": 55100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0949, - "step": 55200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1013, - "step": 55300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0994, - "step": 55400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0999, - "step": 55500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0965, - "step": 55600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1009, - "step": 55700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0952, - "step": 55800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0986, - "step": 55900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0942, - "step": 56000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.102, - "step": 56100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1031, - "step": 56200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.099, - "step": 56300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0951, - "step": 56400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.097, - "step": 56500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1002, - "step": 56600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0953, - "step": 56700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1002, - "step": 56800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0936, - "step": 56900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0987, - "step": 57000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.093, - "step": 57100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0901, - "step": 57200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1005, - "step": 57300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0977, - "step": 57400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0982, - "step": 57500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0979, - "step": 57600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0974, - "step": 57700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0979, - "step": 57800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0922, - "step": 57900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0996, - "step": 58000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0926, - "step": 58100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.095, - "step": 58200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1011, - "step": 58300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0937, - "step": 58400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1016, - "step": 58500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1028, - "step": 58600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0952, - "step": 58700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0958, - "step": 58800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0976, - "step": 58900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0959, - "step": 59000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0931, - "step": 59100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0934, - "step": 59200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0995, - "step": 59300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0909, - "step": 59400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0941, - "step": 59500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.098, - "step": 59600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0958, - "step": 59700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0927, - "step": 59800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0954, - "step": 59900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0928, - "step": 60000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0804443359375, - "eval_runtime": 3083.0678, - "eval_samples_per_second": 364.806, - "eval_steps_per_second": 22.801, - "step": 60000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0972, - "step": 60100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0896, - "step": 60200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0929, - "step": 60300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0896, - "step": 60400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0945, - "step": 60500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1001, - "step": 60600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0927, - "step": 60700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0905, - "step": 60800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0933, - "step": 60900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0923, - "step": 61000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0931, - "step": 61100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0942, - "step": 61200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.092, - "step": 61300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0898, - "step": 61400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0899, - "step": 61500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0901, - "step": 61600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.095, - "step": 61700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0947, - "step": 61800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0957, - "step": 61900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0982, - "step": 62000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0922, - "step": 62100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0899, - "step": 62200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0953, - "step": 62300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0973, - "step": 62400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0908, - "step": 62500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0919, - "step": 62600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0958, - "step": 62700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0945, - "step": 62800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0951, - "step": 62900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0951, - "step": 63000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0874, - "step": 63100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0903, - "step": 63200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0898, - "step": 63300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0939, - "step": 63400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0931, - "step": 63500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0896, - "step": 63600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0945, - "step": 63700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0903, - "step": 63800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0869, - "step": 63900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0886, - "step": 64000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0927, - "step": 64100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0986, - "step": 64200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0952, - "step": 64300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0915, - "step": 64400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0871, - "step": 64500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0879, - "step": 64600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0926, - "step": 64700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0889, - "step": 64800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0892, - "step": 64900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0961, - "step": 65000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0918, - "step": 65100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0921, - "step": 65200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0894, - "step": 65300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.088, - "step": 65400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0918, - "step": 65500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0907, - "step": 65600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0932, - "step": 65700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0855, - "step": 65800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0895, - "step": 65900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0881, - "step": 66000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0916, - "step": 66100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0896, - "step": 66200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0916, - "step": 66300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0878, - "step": 66400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0891, - "step": 66500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0911, - "step": 66600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0932, - "step": 66700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0892, - "step": 66800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0886, - "step": 66900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0879, - "step": 67000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.093, - "step": 67100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0939, - "step": 67200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0993, - "step": 67300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0933, - "step": 67400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0907, - "step": 67500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0834, - "step": 67600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0865, - "step": 67700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0905, - "step": 67800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0839, - "step": 67900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0878, - "step": 68000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0879, - "step": 68100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.09, - "step": 68200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.086, - "step": 68300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0871, - "step": 68400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0855, - "step": 68500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0887, - "step": 68600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0844, - "step": 68700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0856, - "step": 68800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0848, - "step": 68900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0867, - "step": 69000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0859, - "step": 69100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0843, - "step": 69200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0939, - "step": 69300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0895, - "step": 69400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0911, - "step": 69500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0832, - "step": 69600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.085, - "step": 69700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.092, - "step": 69800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0909, - "step": 69900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0873, - "step": 70000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0859, - "step": 70100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0826, - "step": 70200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0832, - "step": 70300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.089, - "step": 70400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0834, - "step": 70500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0865, - "step": 70600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0873, - "step": 70700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0846, - "step": 70800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0857, - "step": 70900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0885, - "step": 71000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0854, - "step": 71100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0963, - "step": 71200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0833, - "step": 71300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0865, - "step": 71400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0865, - "step": 71500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0926, - "step": 71600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0859, - "step": 71700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0857, - "step": 71800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.087, - "step": 71900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0875, - "step": 72000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0846, - "step": 72100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.081, - "step": 72200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0831, - "step": 72300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0875, - "step": 72400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.082, - "step": 72500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0863, - "step": 72600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0893, - "step": 72700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0873, - "step": 72800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0873, - "step": 72900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0898, - "step": 73000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0837, - "step": 73100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0923, - "step": 73200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0813, - "step": 73300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0834, - "step": 73400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0839, - "step": 73500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0887, - "step": 73600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0818, - "step": 73700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0834, - "step": 73800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0868, - "step": 73900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0816, - "step": 74000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0802, - "step": 74100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0896, - "step": 74200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0874, - "step": 74300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0841, - "step": 74400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0832, - "step": 74500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0847, - "step": 74600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.086, - "step": 74700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0903, - "step": 74800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0813, - "step": 74900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0793, - "step": 75000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0889, - "step": 75100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0845, - "step": 75200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0922, - "step": 75300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0835, - "step": 75400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0795, - "step": 75500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.085, - "step": 75600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0769, - "step": 75700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0844, - "step": 75800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.09, - "step": 75900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0785, - "step": 76000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0865, - "step": 76100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0891, - "step": 76200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0843, - "step": 76300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0827, - "step": 76400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0851, - "step": 76500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0856, - "step": 76600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0924, - "step": 76700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0799, - "step": 76800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0833, - "step": 76900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0856, - "step": 77000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0845, - "step": 77100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0907, - "step": 77200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0842, - "step": 77300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0821, - "step": 77400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0864, - "step": 77500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0823, - "step": 77600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0844, - "step": 77700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0802, - "step": 77800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0901, - "step": 77900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0837, - "step": 78000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0862, - "step": 78100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0845, - "step": 78200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0805, - "step": 78300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0875, - "step": 78400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0827, - "step": 78500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0827, - "step": 78600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0837, - "step": 78700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0825, - "step": 78800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0836, - "step": 78900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0815, - "step": 79000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0813, - "step": 79100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0887, - "step": 79200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0817, - "step": 79300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0824, - "step": 79400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0819, - "step": 79500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.082, - "step": 79600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0815, - "step": 79700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.083, - "step": 79800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.088, - "step": 79900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0826, - "step": 80000 - }, - { - "epoch": 0.0, - "eval_loss": 0.069091796875, - "eval_runtime": 3095.1044, - "eval_samples_per_second": 363.388, - "eval_steps_per_second": 22.712, - "step": 80000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0816, - "step": 80100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0808, - "step": 80200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.09, - "step": 80300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0787, - "step": 80400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0847, - "step": 80500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0852, - "step": 80600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0827, - "step": 80700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0847, - "step": 80800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.085, - "step": 80900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0835, - "step": 81000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0761, - "step": 81100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0825, - "step": 81200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0841, - "step": 81300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0803, - "step": 81400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0798, - "step": 81500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0787, - "step": 81600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0836, - "step": 81700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.077, - "step": 81800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0832, - "step": 81900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.077, - "step": 82000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.082, - "step": 82100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0804, - "step": 82200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0877, - "step": 82300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0864, - "step": 82400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0841, - "step": 82500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0839, - "step": 82600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0797, - "step": 82700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0827, - "step": 82800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0785, - "step": 82900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0788, - "step": 83000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0807, - "step": 83100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0874, - "step": 83200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0813, - "step": 83300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0821, - "step": 83400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0834, - "step": 83500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0805, - "step": 83600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0746, - "step": 83700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0806, - "step": 83800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0775, - "step": 83900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0818, - "step": 84000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0843, - "step": 84100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0822, - "step": 84200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0827, - "step": 84300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0804, - "step": 84400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0835, - "step": 84500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0799, - "step": 84600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0835, - "step": 84700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0766, - "step": 84800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0834, - "step": 84900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0784, - "step": 85000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0824, - "step": 85100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0784, - "step": 85200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0796, - "step": 85300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0774, - "step": 85400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0797, - "step": 85500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.079, - "step": 85600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0777, - "step": 85700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0787, - "step": 85800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0748, - "step": 85900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0782, - "step": 86000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0798, - "step": 86100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0788, - "step": 86200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.08, - "step": 86300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0837, - "step": 86400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0786, - "step": 86500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0787, - "step": 86600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.074, - "step": 86700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0785, - "step": 86800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0818, - "step": 86900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0758, - "step": 87000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0798, - "step": 87100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0822, - "step": 87200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0754, - "step": 87300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0783, - "step": 87400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0802, - "step": 87500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0792, - "step": 87600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0797, - "step": 87700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0816, - "step": 87800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0795, - "step": 87900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0768, - "step": 88000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0864, - "step": 88100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0792, - "step": 88200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0767, - "step": 88300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0791, - "step": 88400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0812, - "step": 88500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0795, - "step": 88600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0769, - "step": 88700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.082, - "step": 88800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0797, - "step": 88900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0777, - "step": 89000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0746, - "step": 89100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.083, - "step": 89200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0827, - "step": 89300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0797, - "step": 89400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0752, - "step": 89500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0827, - "step": 89600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0781, - "step": 89700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0779, - "step": 89800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.079, - "step": 89900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.081, - "step": 90000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0789, - "step": 90100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.077, - "step": 90200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0745, - "step": 90300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0743, - "step": 90400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0816, - "step": 90500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0813, - "step": 90600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0808, - "step": 90700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0837, - "step": 90800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0783, - "step": 90900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0757, - "step": 91000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0778, - "step": 91100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0781, - "step": 91200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0786, - "step": 91300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0802, - "step": 91400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0781, - "step": 91500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0769, - "step": 91600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0809, - "step": 91700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0722, - "step": 91800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0815, - "step": 91900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0767, - "step": 92000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0756, - "step": 92100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0775, - "step": 92200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0781, - "step": 92300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0814, - "step": 92400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0752, - "step": 92500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0802, - "step": 92600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.074, - "step": 92700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.079, - "step": 92800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0826, - "step": 92900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0777, - "step": 93000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0731, - "step": 93100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0799, - "step": 93200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0806, - "step": 93300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.083, - "step": 93400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0773, - "step": 93500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0758, - "step": 93600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0765, - "step": 93700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0792, - "step": 93800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0814, - "step": 93900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0786, - "step": 94000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0769, - "step": 94100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.078, - "step": 94200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0796, - "step": 94300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.079, - "step": 94400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.078, - "step": 94500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0768, - "step": 94600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0736, - "step": 94700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0752, - "step": 94800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0788, - "step": 94900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.077, - "step": 95000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0716, - "step": 95100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0765, - "step": 95200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0753, - "step": 95300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.075, - "step": 95400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.073, - "step": 95500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0771, - "step": 95600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0826, - "step": 95700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0764, - "step": 95800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0772, - "step": 95900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0757, - "step": 96000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0737, - "step": 96100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0803, - "step": 96200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0733, - "step": 96300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0752, - "step": 96400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0802, - "step": 96500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0746, - "step": 96600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0759, - "step": 96700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0798, - "step": 96800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0701, - "step": 96900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0739, - "step": 97000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.077, - "step": 97100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0753, - "step": 97200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0764, - "step": 97300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0764, - "step": 97400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0727, - "step": 97500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0774, - "step": 97600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0771, - "step": 97700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0744, - "step": 97800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0752, - "step": 97900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0741, - "step": 98000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0771, - "step": 98100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0834, - "step": 98200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0753, - "step": 98300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0726, - "step": 98400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0759, - "step": 98500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0722, - "step": 98600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0754, - "step": 98700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0772, - "step": 98800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0734, - "step": 98900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0708, - "step": 99000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0771, - "step": 99100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0762, - "step": 99200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.075, - "step": 99300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0765, - "step": 99400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0737, - "step": 99500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0774, - "step": 99600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0758, - "step": 99700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0742, - "step": 99800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0729, - "step": 99900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0735, - "step": 100000 - }, - { - "epoch": 0.0, - "eval_loss": 0.062103271484375, - "eval_runtime": 3145.6437, - "eval_samples_per_second": 357.549, - "eval_steps_per_second": 22.347, - "step": 100000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0764, - "step": 100100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0765, - "step": 100200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0761, - "step": 100300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0737, - "step": 100400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0767, - "step": 100500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0708, - "step": 100600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0697, - "step": 100700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0779, - "step": 100800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0771, - "step": 100900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0784, - "step": 101000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0718, - "step": 101100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0723, - "step": 101200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0812, - "step": 101300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0767, - "step": 101400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0755, - "step": 101500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0732, - "step": 101600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0709, - "step": 101700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0804, - "step": 101800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0737, - "step": 101900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0744, - "step": 102000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0729, - "step": 102100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.074, - "step": 102200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0747, - "step": 102300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.074, - "step": 102400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0742, - "step": 102500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0695, - "step": 102600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0751, - "step": 102700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0744, - "step": 102800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0745, - "step": 102900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0703, - "step": 103000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0775, - "step": 103100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0719, - "step": 103200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0729, - "step": 103300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.072, - "step": 103400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0717, - "step": 103500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.074, - "step": 103600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0744, - "step": 103700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0676, - "step": 103800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0844, - "step": 103900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0718, - "step": 104000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0766, - "step": 104100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0723, - "step": 104200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0753, - "step": 104300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0799, - "step": 104400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0765, - "step": 104500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0662, - "step": 104600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0772, - "step": 104700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0682, - "step": 104800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0661, - "step": 104900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0729, - "step": 105000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0674, - "step": 105100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0764, - "step": 105200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0739, - "step": 105300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0732, - "step": 105400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0742, - "step": 105500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0722, - "step": 105600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0659, - "step": 105700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0722, - "step": 105800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.077, - "step": 105900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0724, - "step": 106000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0714, - "step": 106100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0719, - "step": 106200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0718, - "step": 106300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0721, - "step": 106400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0766, - "step": 106500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0693, - "step": 106600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0719, - "step": 106700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0719, - "step": 106800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0691, - "step": 106900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0768, - "step": 107000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0722, - "step": 107100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0737, - "step": 107200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0692, - "step": 107300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.076, - "step": 107400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0744, - "step": 107500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0786, - "step": 107600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0717, - "step": 107700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0685, - "step": 107800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0721, - "step": 107900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0719, - "step": 108000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0674, - "step": 108100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0729, - "step": 108200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0689, - "step": 108300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.065, - "step": 108400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0721, - "step": 108500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0714, - "step": 108600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0725, - "step": 108700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0756, - "step": 108800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0716, - "step": 108900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.079, - "step": 109000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0713, - "step": 109100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0713, - "step": 109200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0739, - "step": 109300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0737, - "step": 109400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0706, - "step": 109500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0715, - "step": 109600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0755, - "step": 109700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0757, - "step": 109800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0726, - "step": 109900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0708, - "step": 110000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0672, - "step": 110100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0732, - "step": 110200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0748, - "step": 110300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0746, - "step": 110400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0801, - "step": 110500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0683, - "step": 110600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0739, - "step": 110700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0705, - "step": 110800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0752, - "step": 110900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0685, - "step": 111000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0713, - "step": 111100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0698, - "step": 111200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0705, - "step": 111300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0735, - "step": 111400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0735, - "step": 111500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0737, - "step": 111600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.075, - "step": 111700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.068, - "step": 111800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.073, - "step": 111900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0706, - "step": 112000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0737, - "step": 112100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0711, - "step": 112200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0697, - "step": 112300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0721, - "step": 112400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0737, - "step": 112500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0697, - "step": 112600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0705, - "step": 112700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0701, - "step": 112800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0732, - "step": 112900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0737, - "step": 113000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.071, - "step": 113100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.07, - "step": 113200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0721, - "step": 113300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0738, - "step": 113400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0701, - "step": 113500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.072, - "step": 113600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0744, - "step": 113700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0705, - "step": 113800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0719, - "step": 113900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0642, - "step": 114000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0737, - "step": 114100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0759, - "step": 114200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0708, - "step": 114300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0672, - "step": 114400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0745, - "step": 114500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0682, - "step": 114600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.07, - "step": 114700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0678, - "step": 114800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0642, - "step": 114900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0747, - "step": 115000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0736, - "step": 115100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0705, - "step": 115200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0669, - "step": 115300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0662, - "step": 115400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0693, - "step": 115500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0727, - "step": 115600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0721, - "step": 115700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.073, - "step": 115800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0724, - "step": 115900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0681, - "step": 116000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0671, - "step": 116100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0689, - "step": 116200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0692, - "step": 116300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0712, - "step": 116400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0681, - "step": 116500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0712, - "step": 116600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0686, - "step": 116700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0666, - "step": 116800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0685, - "step": 116900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0684, - "step": 117000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0688, - "step": 117100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0708, - "step": 117200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0702, - "step": 117300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0727, - "step": 117400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0672, - "step": 117500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0681, - "step": 117600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0712, - "step": 117700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0743, - "step": 117800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0708, - "step": 117900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0689, - "step": 118000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0771, - "step": 118100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0703, - "step": 118200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.072, - "step": 118300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0676, - "step": 118400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0674, - "step": 118500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0679, - "step": 118600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.068, - "step": 118700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0675, - "step": 118800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0658, - "step": 118900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0699, - "step": 119000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0689, - "step": 119100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0751, - "step": 119200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0724, - "step": 119300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0687, - "step": 119400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.067, - "step": 119500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0722, - "step": 119600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0633, - "step": 119700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.067, - "step": 119800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0732, - "step": 119900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.067, - "step": 120000 - }, - { - "epoch": 0.0, - "eval_loss": 0.058624267578125, - "eval_runtime": 3109.0534, - "eval_samples_per_second": 361.757, - "eval_steps_per_second": 22.61, - "step": 120000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0644, - "step": 120100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0633, - "step": 120200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0626, - "step": 120300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0616, - "step": 120400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0595, - "step": 120500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0587, - "step": 120600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.059, - "step": 120700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0627, - "step": 120800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0616, - "step": 120900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0568, - "step": 121000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0607, - "step": 121100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0576, - "step": 121200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0588, - "step": 121300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.058, - "step": 121400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0616, - "step": 121500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0589, - "step": 121600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0625, - "step": 121700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0605, - "step": 121800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0609, - "step": 121900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0589, - "step": 122000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0617, - "step": 122100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.06, - "step": 122200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0622, - "step": 122300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0593, - "step": 122400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0598, - "step": 122500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0592, - "step": 122600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0578, - "step": 122700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0634, - "step": 122800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0628, - "step": 122900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0592, - "step": 123000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0601, - "step": 123100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0627, - "step": 123200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0582, - "step": 123300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0587, - "step": 123400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0569, - "step": 123500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0608, - "step": 123600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0589, - "step": 123700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0626, - "step": 123800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0598, - "step": 123900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0609, - "step": 124000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0618, - "step": 124100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0599, - "step": 124200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0597, - "step": 124300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0594, - "step": 124400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0583, - "step": 124500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0642, - "step": 124600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0586, - "step": 124700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0589, - "step": 124800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0593, - "step": 124900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.063, - "step": 125000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0605, - "step": 125100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.062, - "step": 125200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0587, - "step": 125300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0592, - "step": 125400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0584, - "step": 125500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0626, - "step": 125600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0643, - "step": 125700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0578, - "step": 125800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0586, - "step": 125900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.06, - "step": 126000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.059, - "step": 126100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.061, - "step": 126200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0618, - "step": 126300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0597, - "step": 126400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0608, - "step": 126500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0644, - "step": 126600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0562, - "step": 126700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0568, - "step": 126800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0581, - "step": 126900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0594, - "step": 127000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0568, - "step": 127100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.059, - "step": 127200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0612, - "step": 127300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0589, - "step": 127400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0594, - "step": 127500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0596, - "step": 127600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0596, - "step": 127700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.058, - "step": 127800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0602, - "step": 127900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0587, - "step": 128000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0595, - "step": 128100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0585, - "step": 128200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0568, - "step": 128300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0589, - "step": 128400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0589, - "step": 128500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0555, - "step": 128600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0595, - "step": 128700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0577, - "step": 128800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0597, - "step": 128900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0591, - "step": 129000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.056, - "step": 129100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0588, - "step": 129200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0585, - "step": 129300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0587, - "step": 129400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0588, - "step": 129500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0581, - "step": 129600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0588, - "step": 129700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0555, - "step": 129800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.058, - "step": 129900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0567, - "step": 130000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0579, - "step": 130100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0598, - "step": 130200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.06, - "step": 130300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.06, - "step": 130400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0566, - "step": 130500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0596, - "step": 130600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0579, - "step": 130700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0574, - "step": 130800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0571, - "step": 130900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0565, - "step": 131000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0578, - "step": 131100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0583, - "step": 131200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0563, - "step": 131300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0575, - "step": 131400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0558, - "step": 131500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0596, - "step": 131600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0576, - "step": 131700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0576, - "step": 131800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0562, - "step": 131900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0564, - "step": 132000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0578, - "step": 132100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0572, - "step": 132200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0572, - "step": 132300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0564, - "step": 132400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0604, - "step": 132500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0566, - "step": 132600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0564, - "step": 132700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0584, - "step": 132800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0573, - "step": 132900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0615, - "step": 133000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0573, - "step": 133100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0569, - "step": 133200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.057, - "step": 133300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0558, - "step": 133400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0565, - "step": 133500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0576, - "step": 133600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0581, - "step": 133700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0564, - "step": 133800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0602, - "step": 133900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0568, - "step": 134000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0566, - "step": 134100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0581, - "step": 134200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0562, - "step": 134300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0561, - "step": 134400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0566, - "step": 134500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0575, - "step": 134600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0567, - "step": 134700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0558, - "step": 134800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0552, - "step": 134900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0555, - "step": 135000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0584, - "step": 135100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0569, - "step": 135200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0559, - "step": 135300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0561, - "step": 135400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0542, - "step": 135500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0578, - "step": 135600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0561, - "step": 135700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0552, - "step": 135800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0563, - "step": 135900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0588, - "step": 136000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0561, - "step": 136100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0562, - "step": 136200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0585, - "step": 136300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0558, - "step": 136400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0552, - "step": 136500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0552, - "step": 136600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.058, - "step": 136700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0561, - "step": 136800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0561, - "step": 136900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0578, - "step": 137000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.054, - "step": 137100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0551, - "step": 137200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0571, - "step": 137300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0508, - "step": 137400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0547, - "step": 137500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0546, - "step": 137600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0573, - "step": 137700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0557, - "step": 137800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0604, - "step": 137900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0559, - "step": 138000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0557, - "step": 138100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0567, - "step": 138200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0585, - "step": 138300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0544, - "step": 138400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0579, - "step": 138500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0537, - "step": 138600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.058, - "step": 138700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0576, - "step": 138800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0554, - "step": 138900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.058, - "step": 139000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0525, - "step": 139100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0536, - "step": 139200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0538, - "step": 139300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0554, - "step": 139400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0545, - "step": 139500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0552, - "step": 139600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0557, - "step": 139700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0568, - "step": 139800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0544, - "step": 139900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0562, - "step": 140000 - }, - { - "epoch": 1.0, - "eval_loss": 0.04754638671875, - "eval_runtime": 3429.7025, - "eval_samples_per_second": 327.936, - "eval_steps_per_second": 20.496, - "step": 140000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0557, - "step": 140100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0569, - "step": 140200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.055, - "step": 140300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0549, - "step": 140400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0561, - "step": 140500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0575, - "step": 140600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0558, - "step": 140700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0537, - "step": 140800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0521, - "step": 140900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0557, - "step": 141000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.053, - "step": 141100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0544, - "step": 141200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0574, - "step": 141300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0547, - "step": 141400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0552, - "step": 141500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0559, - "step": 141600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0549, - "step": 141700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0547, - "step": 141800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0566, - "step": 141900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0525, - "step": 142000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0537, - "step": 142100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0537, - "step": 142200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0562, - "step": 142300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0553, - "step": 142400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0549, - "step": 142500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0561, - "step": 142600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0544, - "step": 142700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0557, - "step": 142800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.054, - "step": 142900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0533, - "step": 143000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0553, - "step": 143100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0569, - "step": 143200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0528, - "step": 143300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.056, - "step": 143400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0556, - "step": 143500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0544, - "step": 143600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0545, - "step": 143700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0559, - "step": 143800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0545, - "step": 143900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0542, - "step": 144000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0563, - "step": 144100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0542, - "step": 144200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0537, - "step": 144300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.056, - "step": 144400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0537, - "step": 144500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0532, - "step": 144600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0521, - "step": 144700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0548, - "step": 144800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0543, - "step": 144900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0554, - "step": 145000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0541, - "step": 145100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0534, - "step": 145200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0541, - "step": 145300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0525, - "step": 145400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0549, - "step": 145500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.056, - "step": 145600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.051, - "step": 145700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0563, - "step": 145800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0547, - "step": 145900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.055, - "step": 146000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0536, - "step": 146100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.054, - "step": 146200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0546, - "step": 146300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0543, - "step": 146400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0537, - "step": 146500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0551, - "step": 146600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.053, - "step": 146700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0542, - "step": 146800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0543, - "step": 146900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0574, - "step": 147000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0517, - "step": 147100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0531, - "step": 147200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0547, - "step": 147300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0552, - "step": 147400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0558, - "step": 147500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0565, - "step": 147600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0547, - "step": 147700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0533, - "step": 147800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0534, - "step": 147900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0511, - "step": 148000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0548, - "step": 148100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0545, - "step": 148200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0534, - "step": 148300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0524, - "step": 148400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0538, - "step": 148500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0536, - "step": 148600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0549, - "step": 148700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0547, - "step": 148800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0546, - "step": 148900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0576, - "step": 149000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0537, - "step": 149100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0521, - "step": 149200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0528, - "step": 149300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0535, - "step": 149400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0516, - "step": 149500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0564, - "step": 149600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0546, - "step": 149700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0538, - "step": 149800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0516, - "step": 149900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.054, - "step": 150000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0512, - "step": 150100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0523, - "step": 150200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0535, - "step": 150300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0522, - "step": 150400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0527, - "step": 150500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.055, - "step": 150600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0517, - "step": 150700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0536, - "step": 150800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0524, - "step": 150900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0548, - "step": 151000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0517, - "step": 151100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0501, - "step": 151200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0523, - "step": 151300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.051, - "step": 151400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0535, - "step": 151500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0519, - "step": 151600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0509, - "step": 151700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0541, - "step": 151800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0548, - "step": 151900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0533, - "step": 152000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0529, - "step": 152100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0552, - "step": 152200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0549, - "step": 152300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0546, - "step": 152400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0553, - "step": 152500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0533, - "step": 152600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0562, - "step": 152700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0514, - "step": 152800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0505, - "step": 152900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0549, - "step": 153000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0539, - "step": 153100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0534, - "step": 153200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0555, - "step": 153300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0531, - "step": 153400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0551, - "step": 153500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.052, - "step": 153600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0539, - "step": 153700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.05, - "step": 153800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0521, - "step": 153900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0519, - "step": 154000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0525, - "step": 154100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0525, - "step": 154200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0516, - "step": 154300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.054, - "step": 154400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0529, - "step": 154500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0538, - "step": 154600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0524, - "step": 154700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0528, - "step": 154800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0537, - "step": 154900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0535, - "step": 155000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0554, - "step": 155100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0524, - "step": 155200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0532, - "step": 155300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0528, - "step": 155400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0514, - "step": 155500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0512, - "step": 155600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0532, - "step": 155700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.052, - "step": 155800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.052, - "step": 155900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.053, - "step": 156000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0521, - "step": 156100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0522, - "step": 156200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0515, - "step": 156300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0521, - "step": 156400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0541, - "step": 156500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0528, - "step": 156600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0514, - "step": 156700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0536, - "step": 156800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0535, - "step": 156900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0535, - "step": 157000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0514, - "step": 157100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0525, - "step": 157200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0531, - "step": 157300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.052, - "step": 157400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0504, - "step": 157500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.052, - "step": 157600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0494, - "step": 157700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0533, - "step": 157800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0517, - "step": 157900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0516, - "step": 158000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0507, - "step": 158100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0534, - "step": 158200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0502, - "step": 158300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0532, - "step": 158400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0534, - "step": 158500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0512, - "step": 158600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0523, - "step": 158700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0533, - "step": 158800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0527, - "step": 158900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0535, - "step": 159000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.054, - "step": 159100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0515, - "step": 159200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0527, - "step": 159300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.05, - "step": 159400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0508, - "step": 159500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0538, - "step": 159600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0526, - "step": 159700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0517, - "step": 159800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0506, - "step": 159900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0517, - "step": 160000 - }, - { - "epoch": 1.0, - "eval_loss": 0.044464111328125, - "eval_runtime": 3391.5213, - "eval_samples_per_second": 331.628, - "eval_steps_per_second": 20.727, - "step": 160000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0509, - "step": 160100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0516, - "step": 160200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0543, - "step": 160300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.053, - "step": 160400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0511, - "step": 160500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.052, - "step": 160600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0544, - "step": 160700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0517, - "step": 160800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.05, - "step": 160900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0494, - "step": 161000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0542, - "step": 161100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0521, - "step": 161200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0537, - "step": 161300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0491, - "step": 161400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0525, - "step": 161500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0534, - "step": 161600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0544, - "step": 161700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0523, - "step": 161800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.053, - "step": 161900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.052, - "step": 162000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0516, - "step": 162100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0526, - "step": 162200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0549, - "step": 162300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0516, - "step": 162400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0508, - "step": 162500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0513, - "step": 162600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0514, - "step": 162700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0517, - "step": 162800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0504, - "step": 162900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0515, - "step": 163000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0529, - "step": 163100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0513, - "step": 163200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0483, - "step": 163300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0504, - "step": 163400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0518, - "step": 163500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0533, - "step": 163600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0511, - "step": 163700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0484, - "step": 163800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0505, - "step": 163900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0511, - "step": 164000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0502, - "step": 164100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0516, - "step": 164200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0503, - "step": 164300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0502, - "step": 164400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0504, - "step": 164500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0509, - "step": 164600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0505, - "step": 164700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0496, - "step": 164800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0497, - "step": 164900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0486, - "step": 165000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0529, - "step": 165100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.05, - "step": 165200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0515, - "step": 165300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0491, - "step": 165400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.05, - "step": 165500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0517, - "step": 165600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0487, - "step": 165700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0522, - "step": 165800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0519, - "step": 165900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0508, - "step": 166000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0493, - "step": 166100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0499, - "step": 166200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0516, - "step": 166300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0481, - "step": 166400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0527, - "step": 166500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0494, - "step": 166600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0506, - "step": 166700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0506, - "step": 166800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0527, - "step": 166900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0496, - "step": 167000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0508, - "step": 167100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0492, - "step": 167200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0529, - "step": 167300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0488, - "step": 167400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0488, - "step": 167500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0498, - "step": 167600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0498, - "step": 167700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0512, - "step": 167800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.052, - "step": 167900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0513, - "step": 168000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0517, - "step": 168100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0497, - "step": 168200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0478, - "step": 168300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0517, - "step": 168400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0534, - "step": 168500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0524, - "step": 168600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0502, - "step": 168700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0502, - "step": 168800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0505, - "step": 168900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.051, - "step": 169000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0504, - "step": 169100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.047, - "step": 169200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.053, - "step": 169300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0507, - "step": 169400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0527, - "step": 169500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0522, - "step": 169600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0491, - "step": 169700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0507, - "step": 169800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0506, - "step": 169900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0516, - "step": 170000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0498, - "step": 170100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0528, - "step": 170200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0542, - "step": 170300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0531, - "step": 170400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0493, - "step": 170500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0511, - "step": 170600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0532, - "step": 170700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0489, - "step": 170800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0501, - "step": 170900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0528, - "step": 171000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0491, - "step": 171100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0491, - "step": 171200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.052, - "step": 171300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0479, - "step": 171400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0482, - "step": 171500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0503, - "step": 171600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0485, - "step": 171700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0518, - "step": 171800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0475, - "step": 171900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0522, - "step": 172000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0511, - "step": 172100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0512, - "step": 172200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0521, - "step": 172300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0518, - "step": 172400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0503, - "step": 172500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0479, - "step": 172600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0504, - "step": 172700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0474, - "step": 172800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0498, - "step": 172900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0499, - "step": 173000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0503, - "step": 173100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0511, - "step": 173200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0508, - "step": 173300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0507, - "step": 173400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0519, - "step": 173500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0512, - "step": 173600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.049, - "step": 173700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.05, - "step": 173800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0503, - "step": 173900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0511, - "step": 174000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0505, - "step": 174100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0486, - "step": 174200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0498, - "step": 174300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0502, - "step": 174400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0512, - "step": 174500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0479, - "step": 174600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0503, - "step": 174700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0479, - "step": 174800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0486, - "step": 174900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0518, - "step": 175000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0531, - "step": 175100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0499, - "step": 175200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0495, - "step": 175300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0511, - "step": 175400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0477, - "step": 175500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0496, - "step": 175600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0496, - "step": 175700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0504, - "step": 175800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0491, - "step": 175900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0482, - "step": 176000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0512, - "step": 176100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0497, - "step": 176200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0508, - "step": 176300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0493, - "step": 176400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0488, - "step": 176500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0501, - "step": 176600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0493, - "step": 176700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0513, - "step": 176800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0525, - "step": 176900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0466, - "step": 177000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0507, - "step": 177100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0502, - "step": 177200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0499, - "step": 177300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0477, - "step": 177400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0515, - "step": 177500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0533, - "step": 177600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0505, - "step": 177700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0473, - "step": 177800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0484, - "step": 177900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0527, - "step": 178000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0472, - "step": 178100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0497, - "step": 178200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0502, - "step": 178300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0501, - "step": 178400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0473, - "step": 178500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0487, - "step": 178600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0503, - "step": 178700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0486, - "step": 178800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0524, - "step": 178900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0503, - "step": 179000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0485, - "step": 179100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0471, - "step": 179200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0493, - "step": 179300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0499, - "step": 179400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0472, - "step": 179500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0476, - "step": 179600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.05, - "step": 179700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0464, - "step": 179800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0501, - "step": 179900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0477, - "step": 180000 - }, - { - "epoch": 1.0, - "eval_loss": 0.0428466796875, - "eval_runtime": 3293.9688, - "eval_samples_per_second": 341.449, - "eval_steps_per_second": 21.341, - "step": 180000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.047, - "step": 180100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0521, - "step": 180200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0518, - "step": 180300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0471, - "step": 180400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0518, - "step": 180500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0492, - "step": 180600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0474, - "step": 180700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0496, - "step": 180800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0503, - "step": 180900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0499, - "step": 181000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.049, - "step": 181100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0475, - "step": 181200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0497, - "step": 181300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0479, - "step": 181400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0496, - "step": 181500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0508, - "step": 181600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0504, - "step": 181700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.05, - "step": 181800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0494, - "step": 181900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0473, - "step": 182000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0496, - "step": 182100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0479, - "step": 182200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0497, - "step": 182300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0474, - "step": 182400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0502, - "step": 182500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0509, - "step": 182600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0516, - "step": 182700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0507, - "step": 182800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0487, - "step": 182900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0499, - "step": 183000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0484, - "step": 183100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0491, - "step": 183200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0469, - "step": 183300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0499, - "step": 183400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0485, - "step": 183500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0493, - "step": 183600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0478, - "step": 183700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0496, - "step": 183800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0484, - "step": 183900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0473, - "step": 184000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0478, - "step": 184100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0508, - "step": 184200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0515, - "step": 184300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0494, - "step": 184400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0508, - "step": 184500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0476, - "step": 184600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0477, - "step": 184700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0463, - "step": 184800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0473, - "step": 184900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0495, - "step": 185000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0512, - "step": 185100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0515, - "step": 185200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0482, - "step": 185300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.05, - "step": 185400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0495, - "step": 185500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0499, - "step": 185600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0492, - "step": 185700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0489, - "step": 185800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0489, - "step": 185900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0492, - "step": 186000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0479, - "step": 186100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0476, - "step": 186200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0507, - "step": 186300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0503, - "step": 186400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0475, - "step": 186500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0469, - "step": 186600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.05, - "step": 186700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0499, - "step": 186800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0478, - "step": 186900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0489, - "step": 187000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0483, - "step": 187100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0486, - "step": 187200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0459, - "step": 187300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0479, - "step": 187400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0482, - "step": 187500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0495, - "step": 187600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0511, - "step": 187700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.049, - "step": 187800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0486, - "step": 187900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0508, - "step": 188000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0507, - "step": 188100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0482, - "step": 188200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0467, - "step": 188300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0469, - "step": 188400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0475, - "step": 188500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0463, - "step": 188600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0504, - "step": 188700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0465, - "step": 188800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.052, - "step": 188900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.052, - "step": 189000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0473, - "step": 189100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.048, - "step": 189200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0475, - "step": 189300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0484, - "step": 189400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0477, - "step": 189500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0513, - "step": 189600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.047, - "step": 189700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0453, - "step": 189800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0468, - "step": 189900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0489, - "step": 190000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0469, - "step": 190100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.048, - "step": 190200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0487, - "step": 190300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0486, - "step": 190400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0466, - "step": 190500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.049, - "step": 190600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0487, - "step": 190700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0482, - "step": 190800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0485, - "step": 190900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0498, - "step": 191000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.047, - "step": 191100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0486, - "step": 191200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0468, - "step": 191300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0484, - "step": 191400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0522, - "step": 191500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.049, - "step": 191600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0475, - "step": 191700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0472, - "step": 191800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0475, - "step": 191900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0485, - "step": 192000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0483, - "step": 192100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0479, - "step": 192200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0443, - "step": 192300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0477, - "step": 192400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0486, - "step": 192500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0483, - "step": 192600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0455, - "step": 192700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0469, - "step": 192800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.048, - "step": 192900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0492, - "step": 193000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0467, - "step": 193100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0474, - "step": 193200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0464, - "step": 193300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0481, - "step": 193400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0483, - "step": 193500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0476, - "step": 193600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0502, - "step": 193700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0456, - "step": 193800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.046, - "step": 193900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0473, - "step": 194000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0512, - "step": 194100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0462, - "step": 194200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.048, - "step": 194300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0463, - "step": 194400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0483, - "step": 194500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0499, - "step": 194600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0471, - "step": 194700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0488, - "step": 194800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0479, - "step": 194900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0471, - "step": 195000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0468, - "step": 195100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0495, - "step": 195200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.048, - "step": 195300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0521, - "step": 195400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0491, - "step": 195500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0488, - "step": 195600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0466, - "step": 195700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0472, - "step": 195800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0473, - "step": 195900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0493, - "step": 196000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0454, - "step": 196100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0481, - "step": 196200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0476, - "step": 196300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0475, - "step": 196400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0466, - "step": 196500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0507, - "step": 196600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0489, - "step": 196700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0473, - "step": 196800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0479, - "step": 196900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0485, - "step": 197000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0482, - "step": 197100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0477, - "step": 197200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.049, - "step": 197300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0474, - "step": 197400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0472, - "step": 197500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0489, - "step": 197600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0466, - "step": 197700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0492, - "step": 197800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0471, - "step": 197900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0506, - "step": 198000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0484, - "step": 198100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0471, - "step": 198200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.044, - "step": 198300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0477, - "step": 198400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.047, - "step": 198500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0461, - "step": 198600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.047, - "step": 198700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0464, - "step": 198800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0448, - "step": 198900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0464, - "step": 199000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0485, - "step": 199100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0492, - "step": 199200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0461, - "step": 199300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0499, - "step": 199400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0485, - "step": 199500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0463, - "step": 199600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0496, - "step": 199700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0459, - "step": 199800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.049, - "step": 199900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.045, - "step": 200000 - }, - { - "epoch": 1.0, - "eval_loss": 0.041534423828125, - "eval_runtime": 3291.5411, - "eval_samples_per_second": 341.701, - "eval_steps_per_second": 21.357, - "step": 200000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0464, - "step": 200100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0468, - "step": 200200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0469, - "step": 200300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0491, - "step": 200400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0461, - "step": 200500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0488, - "step": 200600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0476, - "step": 200700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0495, - "step": 200800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0457, - "step": 200900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.048, - "step": 201000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0481, - "step": 201100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0473, - "step": 201200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0477, - "step": 201300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.045, - "step": 201400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.046, - "step": 201500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0452, - "step": 201600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0485, - "step": 201700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.048, - "step": 201800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.045, - "step": 201900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0458, - "step": 202000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0462, - "step": 202100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0491, - "step": 202200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0462, - "step": 202300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0463, - "step": 202400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0452, - "step": 202500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0486, - "step": 202600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0463, - "step": 202700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0496, - "step": 202800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0487, - "step": 202900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.048, - "step": 203000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0475, - "step": 203100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0471, - "step": 203200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0439, - "step": 203300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0468, - "step": 203400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0465, - "step": 203500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0488, - "step": 203600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.047, - "step": 203700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0466, - "step": 203800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0467, - "step": 203900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0453, - "step": 204000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0456, - "step": 204100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0471, - "step": 204200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.047, - "step": 204300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0479, - "step": 204400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0454, - "step": 204500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.046, - "step": 204600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0465, - "step": 204700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0497, - "step": 204800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0478, - "step": 204900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0441, - "step": 205000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0488, - "step": 205100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0476, - "step": 205200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0477, - "step": 205300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0472, - "step": 205400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0474, - "step": 205500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0497, - "step": 205600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0453, - "step": 205700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0466, - "step": 205800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0496, - "step": 205900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0454, - "step": 206000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0472, - "step": 206100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0478, - "step": 206200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0462, - "step": 206300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.047, - "step": 206400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0481, - "step": 206500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0437, - "step": 206600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0491, - "step": 206700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0489, - "step": 206800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0478, - "step": 206900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0471, - "step": 207000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0457, - "step": 207100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0466, - "step": 207200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0479, - "step": 207300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0484, - "step": 207400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.047, - "step": 207500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0492, - "step": 207600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0462, - "step": 207700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0454, - "step": 207800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0477, - "step": 207900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.049, - "step": 208000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0501, - "step": 208100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0468, - "step": 208200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0458, - "step": 208300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0459, - "step": 208400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0494, - "step": 208500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0449, - "step": 208600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0469, - "step": 208700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0459, - "step": 208800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0468, - "step": 208900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0481, - "step": 209000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.046, - "step": 209100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0476, - "step": 209200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.048, - "step": 209300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.045, - "step": 209400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.045, - "step": 209500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0432, - "step": 209600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0475, - "step": 209700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.046, - "step": 209800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0463, - "step": 209900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0448, - "step": 210000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0477, - "step": 210100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0454, - "step": 210200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0479, - "step": 210300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0486, - "step": 210400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0459, - "step": 210500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0444, - "step": 210600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0461, - "step": 210700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0448, - "step": 210800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0483, - "step": 210900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0449, - "step": 211000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0464, - "step": 211100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0448, - "step": 211200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0445, - "step": 211300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0478, - "step": 211400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0471, - "step": 211500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.047, - "step": 211600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0457, - "step": 211700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0452, - "step": 211800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0448, - "step": 211900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0459, - "step": 212000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0468, - "step": 212100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0455, - "step": 212200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0462, - "step": 212300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0437, - "step": 212400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0458, - "step": 212500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0463, - "step": 212600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0446, - "step": 212700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0476, - "step": 212800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0468, - "step": 212900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0472, - "step": 213000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0467, - "step": 213100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0454, - "step": 213200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.046, - "step": 213300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.045, - "step": 213400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0452, - "step": 213500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0471, - "step": 213600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0483, - "step": 213700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0479, - "step": 213800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0476, - "step": 213900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0463, - "step": 214000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0444, - "step": 214100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0471, - "step": 214200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.048, - "step": 214300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0442, - "step": 214400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.047, - "step": 214500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0464, - "step": 214600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0471, - "step": 214700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0463, - "step": 214800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.046, - "step": 214900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0437, - "step": 215000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0421, - "step": 215100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0466, - "step": 215200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0475, - "step": 215300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.047, - "step": 215400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0472, - "step": 215500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0443, - "step": 215600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0461, - "step": 215700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0475, - "step": 215800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0446, - "step": 215900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.045, - "step": 216000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0445, - "step": 216100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0462, - "step": 216200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0446, - "step": 216300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0478, - "step": 216400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0462, - "step": 216500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0471, - "step": 216600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0469, - "step": 216700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0477, - "step": 216800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0436, - "step": 216900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0449, - "step": 217000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0461, - "step": 217100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0478, - "step": 217200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0479, - "step": 217300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0463, - "step": 217400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0468, - "step": 217500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0474, - "step": 217600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0478, - "step": 217700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0482, - "step": 217800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0479, - "step": 217900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.047, - "step": 218000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.047, - "step": 218100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0471, - "step": 218200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0461, - "step": 218300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0472, - "step": 218400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0458, - "step": 218500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0467, - "step": 218600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.045, - "step": 218700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.046, - "step": 218800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0458, - "step": 218900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0461, - "step": 219000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0461, - "step": 219100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.047, - "step": 219200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0461, - "step": 219300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0464, - "step": 219400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0463, - "step": 219500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0455, - "step": 219600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0506, - "step": 219700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0475, - "step": 219800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.045, - "step": 219900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0461, - "step": 220000 - }, - { - "epoch": 1.0, - "eval_loss": 0.03997802734375, - "eval_runtime": 3291.9413, - "eval_samples_per_second": 341.659, - "eval_steps_per_second": 21.354, - "step": 220000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0475, - "step": 220100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0435, - "step": 220200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0464, - "step": 220300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0468, - "step": 220400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0491, - "step": 220500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0433, - "step": 220600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0427, - "step": 220700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0472, - "step": 220800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0454, - "step": 220900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0467, - "step": 221000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0455, - "step": 221100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0452, - "step": 221200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0452, - "step": 221300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0444, - "step": 221400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0435, - "step": 221500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0481, - "step": 221600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0454, - "step": 221700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0458, - "step": 221800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0434, - "step": 221900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0458, - "step": 222000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0447, - "step": 222100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0428, - "step": 222200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0459, - "step": 222300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0446, - "step": 222400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0439, - "step": 222500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0448, - "step": 222600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0444, - "step": 222700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0436, - "step": 222800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0488, - "step": 222900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0447, - "step": 223000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0463, - "step": 223100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0473, - "step": 223200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.045, - "step": 223300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0461, - "step": 223400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0474, - "step": 223500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0438, - "step": 223600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0443, - "step": 223700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0467, - "step": 223800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0473, - "step": 223900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0457, - "step": 224000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0428, - "step": 224100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0477, - "step": 224200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.045, - "step": 224300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0483, - "step": 224400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0428, - "step": 224500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0498, - "step": 224600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0449, - "step": 224700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0423, - "step": 224800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0437, - "step": 224900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0462, - "step": 225000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0427, - "step": 225100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0504, - "step": 225200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0445, - "step": 225300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0443, - "step": 225400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0471, - "step": 225500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0476, - "step": 225600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0433, - "step": 225700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0466, - "step": 225800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.046, - "step": 225900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.044, - "step": 226000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0464, - "step": 226100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0426, - "step": 226200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0438, - "step": 226300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0477, - "step": 226400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0453, - "step": 226500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0466, - "step": 226600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0438, - "step": 226700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0465, - "step": 226800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0432, - "step": 226900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0454, - "step": 227000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0436, - "step": 227100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.043, - "step": 227200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0452, - "step": 227300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.043, - "step": 227400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0463, - "step": 227500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.049, - "step": 227600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0476, - "step": 227700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0456, - "step": 227800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0444, - "step": 227900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0448, - "step": 228000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.046, - "step": 228100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0434, - "step": 228200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0477, - "step": 228300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0456, - "step": 228400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0464, - "step": 228500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.045, - "step": 228600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0464, - "step": 228700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0448, - "step": 228800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0429, - "step": 228900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0449, - "step": 229000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0439, - "step": 229100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0486, - "step": 229200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0458, - "step": 229300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0455, - "step": 229400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0444, - "step": 229500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0433, - "step": 229600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.046, - "step": 229700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0455, - "step": 229800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.046, - "step": 229900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0437, - "step": 230000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.046, - "step": 230100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0452, - "step": 230200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0451, - "step": 230300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0446, - "step": 230400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0444, - "step": 230500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.046, - "step": 230600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0431, - "step": 230700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0489, - "step": 230800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.047, - "step": 230900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.045, - "step": 231000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0467, - "step": 231100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0443, - "step": 231200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0459, - "step": 231300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0468, - "step": 231400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0474, - "step": 231500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0465, - "step": 231600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.045, - "step": 231700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0446, - "step": 231800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0446, - "step": 231900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0433, - "step": 232000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0444, - "step": 232100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0441, - "step": 232200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0442, - "step": 232300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0445, - "step": 232400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0466, - "step": 232500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0443, - "step": 232600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0464, - "step": 232700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.045, - "step": 232800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0466, - "step": 232900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0492, - "step": 233000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0485, - "step": 233100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0463, - "step": 233200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0452, - "step": 233300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0446, - "step": 233400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0456, - "step": 233500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0439, - "step": 233600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0448, - "step": 233700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0426, - "step": 233800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0446, - "step": 233900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0435, - "step": 234000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0435, - "step": 234100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.043, - "step": 234200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0457, - "step": 234300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0483, - "step": 234400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0474, - "step": 234500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0444, - "step": 234600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0473, - "step": 234700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0451, - "step": 234800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0454, - "step": 234900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0448, - "step": 235000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0452, - "step": 235100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0437, - "step": 235200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0419, - "step": 235300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.045, - "step": 235400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0425, - "step": 235500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0446, - "step": 235600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0433, - "step": 235700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0476, - "step": 235800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.045, - "step": 235900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0458, - "step": 236000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0451, - "step": 236100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0459, - "step": 236200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0451, - "step": 236300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0441, - "step": 236400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0461, - "step": 236500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0438, - "step": 236600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0458, - "step": 236700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0436, - "step": 236800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0441, - "step": 236900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0445, - "step": 237000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0452, - "step": 237100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0477, - "step": 237200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0438, - "step": 237300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0438, - "step": 237400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0434, - "step": 237500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0445, - "step": 237600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0444, - "step": 237700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0445, - "step": 237800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0453, - "step": 237900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0446, - "step": 238000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0444, - "step": 238100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0456, - "step": 238200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.044, - "step": 238300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0411, - "step": 238400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.045, - "step": 238500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0444, - "step": 238600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0456, - "step": 238700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0437, - "step": 238800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0444, - "step": 238900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0433, - "step": 239000 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0452, - "step": 239100 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0448, - "step": 239200 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.045, - "step": 239300 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0464, - "step": 239400 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0463, - "step": 239500 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0464, - "step": 239600 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0446, - "step": 239700 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0433, - "step": 239800 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0435, - "step": 239900 - }, - { - "epoch": 1.0, - "learning_rate": 0.0001, - "loss": 0.0455, - "step": 240000 - }, - { - "epoch": 1.0, - "eval_loss": 0.0391845703125, - "eval_runtime": 3284.0325, - "eval_samples_per_second": 342.482, - "eval_steps_per_second": 21.405, - "step": 240000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0432, - "step": 240100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0441, - "step": 240200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0439, - "step": 240300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0468, - "step": 240400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0435, - "step": 240500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.044, - "step": 240600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0452, - "step": 240700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0428, - "step": 240800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0463, - "step": 240900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0443, - "step": 241000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0465, - "step": 241100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0444, - "step": 241200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0427, - "step": 241300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0454, - "step": 241400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0476, - "step": 241500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0453, - "step": 241600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0448, - "step": 241700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0468, - "step": 241800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0443, - "step": 241900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0465, - "step": 242000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0452, - "step": 242100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0434, - "step": 242200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0449, - "step": 242300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0458, - "step": 242400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0431, - "step": 242500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0447, - "step": 242600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0467, - "step": 242700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0465, - "step": 242800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0434, - "step": 242900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0457, - "step": 243000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0471, - "step": 243100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0454, - "step": 243200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0447, - "step": 243300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0457, - "step": 243400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0427, - "step": 243500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.044, - "step": 243600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0453, - "step": 243700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0424, - "step": 243800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0427, - "step": 243900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0435, - "step": 244000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0446, - "step": 244100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.046, - "step": 244200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0451, - "step": 244300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0422, - "step": 244400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.042, - "step": 244500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0434, - "step": 244600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0449, - "step": 244700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0473, - "step": 244800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0441, - "step": 244900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0446, - "step": 245000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0456, - "step": 245100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0434, - "step": 245200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0444, - "step": 245300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0469, - "step": 245400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0439, - "step": 245500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0449, - "step": 245600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0454, - "step": 245700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0436, - "step": 245800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0434, - "step": 245900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0454, - "step": 246000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0446, - "step": 246100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0451, - "step": 246200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0425, - "step": 246300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0443, - "step": 246400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0458, - "step": 246500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0444, - "step": 246600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0442, - "step": 246700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0434, - "step": 246800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0458, - "step": 246900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0445, - "step": 247000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0449, - "step": 247100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0452, - "step": 247200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0452, - "step": 247300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0445, - "step": 247400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0449, - "step": 247500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0439, - "step": 247600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0409, - "step": 247700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0438, - "step": 247800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0449, - "step": 247900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0427, - "step": 248000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.045, - "step": 248100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0402, - "step": 248200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.041, - "step": 248300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0435, - "step": 248400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0438, - "step": 248500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0427, - "step": 248600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0472, - "step": 248700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0434, - "step": 248800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0432, - "step": 248900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0425, - "step": 249000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0432, - "step": 249100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0417, - "step": 249200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0454, - "step": 249300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.047, - "step": 249400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0446, - "step": 249500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0436, - "step": 249600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.046, - "step": 249700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0434, - "step": 249800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0449, - "step": 249900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0438, - "step": 250000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0446, - "step": 250100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0464, - "step": 250200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0442, - "step": 250300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0467, - "step": 250400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0445, - "step": 250500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0416, - "step": 250600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0429, - "step": 250700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0418, - "step": 250800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0446, - "step": 250900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0452, - "step": 251000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0454, - "step": 251100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0441, - "step": 251200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0454, - "step": 251300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.046, - "step": 251400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0454, - "step": 251500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0443, - "step": 251600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0421, - "step": 251700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0435, - "step": 251800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0428, - "step": 251900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.044, - "step": 252000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0439, - "step": 252100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0445, - "step": 252200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0434, - "step": 252300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0428, - "step": 252400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0425, - "step": 252500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0454, - "step": 252600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.044, - "step": 252700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0442, - "step": 252800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0462, - "step": 252900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0419, - "step": 253000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0432, - "step": 253100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.043, - "step": 253200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0433, - "step": 253300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0454, - "step": 253400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0441, - "step": 253500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0449, - "step": 253600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0444, - "step": 253700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.046, - "step": 253800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0435, - "step": 253900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0429, - "step": 254000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0472, - "step": 254100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0451, - "step": 254200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0442, - "step": 254300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0423, - "step": 254400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.045, - "step": 254500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0469, - "step": 254600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0442, - "step": 254700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0446, - "step": 254800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0439, - "step": 254900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0427, - "step": 255000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0447, - "step": 255100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0438, - "step": 255200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.044, - "step": 255300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0424, - "step": 255400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0426, - "step": 255500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0431, - "step": 255600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0407, - "step": 255700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0432, - "step": 255800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0422, - "step": 255900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.044, - "step": 256000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0419, - "step": 256100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0408, - "step": 256200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0419, - "step": 256300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0433, - "step": 256400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0434, - "step": 256500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0427, - "step": 256600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0453, - "step": 256700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0436, - "step": 256800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.044, - "step": 256900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0434, - "step": 257000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.045, - "step": 257100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0415, - "step": 257200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0441, - "step": 257300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0422, - "step": 257400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0435, - "step": 257500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0422, - "step": 257600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0422, - "step": 257700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0439, - "step": 257800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0453, - "step": 257900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0437, - "step": 258000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0425, - "step": 258100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0428, - "step": 258200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0415, - "step": 258300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0454, - "step": 258400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.044, - "step": 258500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0426, - "step": 258600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0418, - "step": 258700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0456, - "step": 258800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0435, - "step": 258900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0419, - "step": 259000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0436, - "step": 259100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0422, - "step": 259200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0429, - "step": 259300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.042, - "step": 259400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0441, - "step": 259500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0446, - "step": 259600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0448, - "step": 259700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.041, - "step": 259800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0431, - "step": 259900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0417, - "step": 260000 - }, - { - "epoch": 0.0, - "eval_loss": 0.03851318359375, - "eval_runtime": 3581.0286, - "eval_samples_per_second": 314.078, - "eval_steps_per_second": 19.63, - "step": 260000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0439, - "step": 260100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0432, - "step": 260200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0449, - "step": 260300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0428, - "step": 260400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0423, - "step": 260500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0442, - "step": 260600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.043, - "step": 260700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.043, - "step": 260800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0428, - "step": 260900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0428, - "step": 261000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0437, - "step": 261100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0425, - "step": 261200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0458, - "step": 261300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0455, - "step": 261400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0424, - "step": 261500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0451, - "step": 261600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0438, - "step": 261700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0467, - "step": 261800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0454, - "step": 261900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0435, - "step": 262000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0446, - "step": 262100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0474, - "step": 262200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0422, - "step": 262300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0423, - "step": 262400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0422, - "step": 262500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.043, - "step": 262600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0411, - "step": 262700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0404, - "step": 262800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0442, - "step": 262900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0444, - "step": 263000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.041, - "step": 263100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0435, - "step": 263200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0435, - "step": 263300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0411, - "step": 263400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0438, - "step": 263500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0432, - "step": 263600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0433, - "step": 263700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0429, - "step": 263800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0415, - "step": 263900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0451, - "step": 264000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.043, - "step": 264100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0415, - "step": 264200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0425, - "step": 264300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0426, - "step": 264400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0439, - "step": 264500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0446, - "step": 264600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0439, - "step": 264700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0443, - "step": 264800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0428, - "step": 264900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0452, - "step": 265000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0399, - "step": 265100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0438, - "step": 265200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0416, - "step": 265300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0404, - "step": 265400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0428, - "step": 265500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0426, - "step": 265600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0423, - "step": 265700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0448, - "step": 265800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0434, - "step": 265900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0458, - "step": 266000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0444, - "step": 266100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0425, - "step": 266200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.043, - "step": 266300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0432, - "step": 266400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.043, - "step": 266500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0418, - "step": 266600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0429, - "step": 266700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0441, - "step": 266800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0426, - "step": 266900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0438, - "step": 267000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0398, - "step": 267100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0429, - "step": 267200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0444, - "step": 267300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.043, - "step": 267400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0421, - "step": 267500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0429, - "step": 267600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0416, - "step": 267700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0422, - "step": 267800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0436, - "step": 267900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0436, - "step": 268000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0436, - "step": 268100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0418, - "step": 268200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0419, - "step": 268300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0432, - "step": 268400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0426, - "step": 268500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0454, - "step": 268600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0447, - "step": 268700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0433, - "step": 268800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0424, - "step": 268900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0409, - "step": 269000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0407, - "step": 269100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0427, - "step": 269200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0436, - "step": 269300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0423, - "step": 269400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0447, - "step": 269500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0435, - "step": 269600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0422, - "step": 269700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0449, - "step": 269800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0434, - "step": 269900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0433, - "step": 270000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0432, - "step": 270100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0436, - "step": 270200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0442, - "step": 270300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0428, - "step": 270400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0417, - "step": 270500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0422, - "step": 270600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.042, - "step": 270700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0424, - "step": 270800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.043, - "step": 270900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0428, - "step": 271000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.043, - "step": 271100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0412, - "step": 271200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0447, - "step": 271300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0433, - "step": 271400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0429, - "step": 271500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0439, - "step": 271600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0443, - "step": 271700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0409, - "step": 271800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0438, - "step": 271900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0429, - "step": 272000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0421, - "step": 272100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0406, - "step": 272200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0425, - "step": 272300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0406, - "step": 272400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0432, - "step": 272500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0426, - "step": 272600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0418, - "step": 272700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0419, - "step": 272800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0438, - "step": 272900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0425, - "step": 273000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0432, - "step": 273100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0422, - "step": 273200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0439, - "step": 273300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0445, - "step": 273400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.042, - "step": 273500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0408, - "step": 273600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0428, - "step": 273700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0433, - "step": 273800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0414, - "step": 273900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0441, - "step": 274000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0427, - "step": 274100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.043, - "step": 274200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0419, - "step": 274300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.042, - "step": 274400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0431, - "step": 274500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0403, - "step": 274600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0444, - "step": 274700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0449, - "step": 274800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.043, - "step": 274900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0445, - "step": 275000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0432, - "step": 275100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0445, - "step": 275200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0416, - "step": 275300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0427, - "step": 275400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0435, - "step": 275500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.046, - "step": 275600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0453, - "step": 275700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0435, - "step": 275800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0435, - "step": 275900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0425, - "step": 276000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0437, - "step": 276100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0413, - "step": 276200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0405, - "step": 276300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0445, - "step": 276400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0442, - "step": 276500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0425, - "step": 276600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.044, - "step": 276700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0393, - "step": 276800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0444, - "step": 276900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0444, - "step": 277000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0401, - "step": 277100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.042, - "step": 277200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0431, - "step": 277300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0419, - "step": 277400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.043, - "step": 277500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0428, - "step": 277600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.041, - "step": 277700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0416, - "step": 277800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0446, - "step": 277900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0421, - "step": 278000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0433, - "step": 278100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.042, - "step": 278200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0424, - "step": 278300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0418, - "step": 278400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0424, - "step": 278500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0417, - "step": 278600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0413, - "step": 278700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0433, - "step": 278800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0435, - "step": 278900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0415, - "step": 279000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0445, - "step": 279100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.043, - "step": 279200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0432, - "step": 279300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0433, - "step": 279400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0434, - "step": 279500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0428, - "step": 279600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0419, - "step": 279700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0455, - "step": 279800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0391, - "step": 279900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0433, - "step": 280000 - }, - { - "epoch": 0.0, - "eval_loss": 0.036956787109375, - "eval_runtime": 3481.1002, - "eval_samples_per_second": 323.094, - "eval_steps_per_second": 20.194, - "step": 280000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0425, - "step": 280100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0409, - "step": 280200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.043, - "step": 280300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0402, - "step": 280400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0417, - "step": 280500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0438, - "step": 280600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0415, - "step": 280700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0432, - "step": 280800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.043, - "step": 280900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0426, - "step": 281000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0396, - "step": 281100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0423, - "step": 281200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.043, - "step": 281300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0428, - "step": 281400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.041, - "step": 281500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0436, - "step": 281600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0436, - "step": 281700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0426, - "step": 281800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0429, - "step": 281900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0414, - "step": 282000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0432, - "step": 282100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.044, - "step": 282200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.042, - "step": 282300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0427, - "step": 282400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0423, - "step": 282500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0432, - "step": 282600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0421, - "step": 282700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0422, - "step": 282800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0457, - "step": 282900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0435, - "step": 283000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0417, - "step": 283100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.042, - "step": 283200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0434, - "step": 283300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0423, - "step": 283400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0413, - "step": 283500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0429, - "step": 283600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.043, - "step": 283700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0408, - "step": 283800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0426, - "step": 283900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0406, - "step": 284000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.042, - "step": 284100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.045, - "step": 284200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0418, - "step": 284300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0423, - "step": 284400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0419, - "step": 284500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0424, - "step": 284600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0412, - "step": 284700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0415, - "step": 284800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0419, - "step": 284900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0416, - "step": 285000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0446, - "step": 285100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0442, - "step": 285200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0445, - "step": 285300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0427, - "step": 285400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0436, - "step": 285500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0411, - "step": 285600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0432, - "step": 285700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0421, - "step": 285800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.043, - "step": 285900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.041, - "step": 286000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0424, - "step": 286100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.043, - "step": 286200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0422, - "step": 286300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0425, - "step": 286400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0424, - "step": 286500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0408, - "step": 286600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0428, - "step": 286700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0428, - "step": 286800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0416, - "step": 286900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0415, - "step": 287000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0419, - "step": 287100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0426, - "step": 287200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0438, - "step": 287300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0414, - "step": 287400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.044, - "step": 287500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0426, - "step": 287600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.041, - "step": 287700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0418, - "step": 287800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.041, - "step": 287900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0413, - "step": 288000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0415, - "step": 288100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0437, - "step": 288200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0415, - "step": 288300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0428, - "step": 288400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0435, - "step": 288500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0443, - "step": 288600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0413, - "step": 288700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0453, - "step": 288800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0422, - "step": 288900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0454, - "step": 289000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0467, - "step": 289100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0423, - "step": 289200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0421, - "step": 289300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0445, - "step": 289400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0434, - "step": 289500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0424, - "step": 289600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0419, - "step": 289700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.041, - "step": 289800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0411, - "step": 289900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0404, - "step": 290000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0406, - "step": 290100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0435, - "step": 290200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0403, - "step": 290300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0431, - "step": 290400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0432, - "step": 290500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0422, - "step": 290600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0448, - "step": 290700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0444, - "step": 290800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.042, - "step": 290900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0434, - "step": 291000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0402, - "step": 291100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0423, - "step": 291200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0396, - "step": 291300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0425, - "step": 291400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0395, - "step": 291500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0427, - "step": 291600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0439, - "step": 291700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0432, - "step": 291800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0435, - "step": 291900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0437, - "step": 292000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0416, - "step": 292100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0413, - "step": 292200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0423, - "step": 292300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0425, - "step": 292400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0415, - "step": 292500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.043, - "step": 292600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0423, - "step": 292700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0427, - "step": 292800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.044, - "step": 292900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0402, - "step": 293000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0434, - "step": 293100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0409, - "step": 293200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0415, - "step": 293300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0426, - "step": 293400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0411, - "step": 293500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0421, - "step": 293600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0422, - "step": 293700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.042, - "step": 293800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0423, - "step": 293900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.043, - "step": 294000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0436, - "step": 294100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0423, - "step": 294200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0409, - "step": 294300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0433, - "step": 294400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0444, - "step": 294500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0423, - "step": 294600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.041, - "step": 294700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0435, - "step": 294800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0414, - "step": 294900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0412, - "step": 295000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.042, - "step": 295100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0407, - "step": 295200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0414, - "step": 295300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0412, - "step": 295400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0431, - "step": 295500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.043, - "step": 295600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.042, - "step": 295700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0425, - "step": 295800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0416, - "step": 295900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0431, - "step": 296000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0404, - "step": 296100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0407, - "step": 296200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0418, - "step": 296300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0409, - "step": 296400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0418, - "step": 296500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0416, - "step": 296600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0409, - "step": 296700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0419, - "step": 296800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0447, - "step": 296900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.043, - "step": 297000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.043, - "step": 297100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0428, - "step": 297200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0422, - "step": 297300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0411, - "step": 297400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0425, - "step": 297500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0391, - "step": 297600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.043, - "step": 297700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0421, - "step": 297800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0425, - "step": 297900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0427, - "step": 298000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0415, - "step": 298100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0426, - "step": 298200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0426, - "step": 298300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0437, - "step": 298400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0404, - "step": 298500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0417, - "step": 298600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.043, - "step": 298700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0435, - "step": 298800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0413, - "step": 298900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0437, - "step": 299000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0406, - "step": 299100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0404, - "step": 299200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0416, - "step": 299300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0408, - "step": 299400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0421, - "step": 299500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0417, - "step": 299600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.043, - "step": 299700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0421, - "step": 299800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0401, - "step": 299900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0413, - "step": 300000 - }, - { - "epoch": 0.0, - "eval_loss": 0.037200927734375, - "eval_runtime": 3479.7507, - "eval_samples_per_second": 323.219, - "eval_steps_per_second": 20.201, - "step": 300000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.041, - "step": 300100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.04, - "step": 300200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0415, - "step": 300300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0434, - "step": 300400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.044, - "step": 300500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0417, - "step": 300600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0441, - "step": 300700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0434, - "step": 300800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0403, - "step": 300900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0416, - "step": 301000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0432, - "step": 301100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0403, - "step": 301200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0416, - "step": 301300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.043, - "step": 301400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0422, - "step": 301500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0419, - "step": 301600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0411, - "step": 301700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0441, - "step": 301800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0424, - "step": 301900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0402, - "step": 302000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0411, - "step": 302100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0433, - "step": 302200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0418, - "step": 302300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0414, - "step": 302400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.041, - "step": 302500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0417, - "step": 302600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.044, - "step": 302700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0421, - "step": 302800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0436, - "step": 302900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0404, - "step": 303000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0421, - "step": 303100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0405, - "step": 303200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0407, - "step": 303300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.04, - "step": 303400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.042, - "step": 303500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.043, - "step": 303600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.043, - "step": 303700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0424, - "step": 303800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0427, - "step": 303900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0437, - "step": 304000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0406, - "step": 304100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0438, - "step": 304200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0432, - "step": 304300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0399, - "step": 304400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0433, - "step": 304500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.043, - "step": 304600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0424, - "step": 304700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0421, - "step": 304800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.043, - "step": 304900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.044, - "step": 305000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0402, - "step": 305100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0404, - "step": 305200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0425, - "step": 305300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0426, - "step": 305400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0408, - "step": 305500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0394, - "step": 305600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0414, - "step": 305700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0431, - "step": 305800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.041, - "step": 305900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0403, - "step": 306000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0439, - "step": 306100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0411, - "step": 306200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0444, - "step": 306300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0417, - "step": 306400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0427, - "step": 306500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0405, - "step": 306600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0414, - "step": 306700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0416, - "step": 306800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0394, - "step": 306900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0428, - "step": 307000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0393, - "step": 307100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0418, - "step": 307200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0424, - "step": 307300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.042, - "step": 307400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0423, - "step": 307500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0416, - "step": 307600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0429, - "step": 307700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0411, - "step": 307800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0411, - "step": 307900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0409, - "step": 308000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0405, - "step": 308100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0429, - "step": 308200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0403, - "step": 308300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0424, - "step": 308400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0422, - "step": 308500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0416, - "step": 308600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0426, - "step": 308700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0406, - "step": 308800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0408, - "step": 308900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0427, - "step": 309000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0417, - "step": 309100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.042, - "step": 309200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0421, - "step": 309300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.042, - "step": 309400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0405, - "step": 309500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0436, - "step": 309600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0441, - "step": 309700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0418, - "step": 309800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0428, - "step": 309900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.042, - "step": 310000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.042, - "step": 310100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0422, - "step": 310200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0404, - "step": 310300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0418, - "step": 310400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0418, - "step": 310500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0418, - "step": 310600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0416, - "step": 310700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0422, - "step": 310800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0443, - "step": 310900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0422, - "step": 311000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0395, - "step": 311100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.041, - "step": 311200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0405, - "step": 311300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0413, - "step": 311400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0419, - "step": 311500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.042, - "step": 311600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0417, - "step": 311700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0413, - "step": 311800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0414, - "step": 311900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.042, - "step": 312000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.042, - "step": 312100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0408, - "step": 312200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0444, - "step": 312300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0441, - "step": 312400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0405, - "step": 312500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0414, - "step": 312600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0421, - "step": 312700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.041, - "step": 312800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0409, - "step": 312900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0411, - "step": 313000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0418, - "step": 313100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0437, - "step": 313200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0401, - "step": 313300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0421, - "step": 313400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0412, - "step": 313500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.043, - "step": 313600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0414, - "step": 313700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0404, - "step": 313800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0405, - "step": 313900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0412, - "step": 314000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0432, - "step": 314100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0399, - "step": 314200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0432, - "step": 314300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.041, - "step": 314400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0407, - "step": 314500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0411, - "step": 314600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.04, - "step": 314700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0419, - "step": 314800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0437, - "step": 314900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0393, - "step": 315000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0418, - "step": 315100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0408, - "step": 315200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0399, - "step": 315300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0401, - "step": 315400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0421, - "step": 315500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0419, - "step": 315600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0429, - "step": 315700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0411, - "step": 315800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.041, - "step": 315900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0397, - "step": 316000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.04, - "step": 316100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0419, - "step": 316200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0422, - "step": 316300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0416, - "step": 316400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0428, - "step": 316500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0426, - "step": 316600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0419, - "step": 316700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0425, - "step": 316800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0406, - "step": 316900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0424, - "step": 317000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0401, - "step": 317100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0406, - "step": 317200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0435, - "step": 317300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0416, - "step": 317400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0403, - "step": 317500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.041, - "step": 317600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0414, - "step": 317700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0429, - "step": 317800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0409, - "step": 317900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0426, - "step": 318000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0398, - "step": 318100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0428, - "step": 318200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0424, - "step": 318300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0418, - "step": 318400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0437, - "step": 318500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0399, - "step": 318600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0408, - "step": 318700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0389, - "step": 318800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0419, - "step": 318900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0425, - "step": 319000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0448, - "step": 319100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0419, - "step": 319200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0426, - "step": 319300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0434, - "step": 319400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0422, - "step": 319500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.038, - "step": 319600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.042, - "step": 319700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0432, - "step": 319800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.039, - "step": 319900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0414, - "step": 320000 - }, - { - "epoch": 0.0, - "eval_loss": 0.036041259765625, - "eval_runtime": 3349.372, - "eval_samples_per_second": 335.801, - "eval_steps_per_second": 20.988, - "step": 320000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0409, - "step": 320100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0408, - "step": 320200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0436, - "step": 320300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0397, - "step": 320400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0408, - "step": 320500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0392, - "step": 320600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.04, - "step": 320700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0411, - "step": 320800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0415, - "step": 320900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0426, - "step": 321000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0401, - "step": 321100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0415, - "step": 321200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0415, - "step": 321300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0406, - "step": 321400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0412, - "step": 321500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0396, - "step": 321600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0411, - "step": 321700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0436, - "step": 321800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0425, - "step": 321900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0434, - "step": 322000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0407, - "step": 322100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0407, - "step": 322200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0397, - "step": 322300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0415, - "step": 322400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0401, - "step": 322500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0411, - "step": 322600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0402, - "step": 322700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0403, - "step": 322800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.042, - "step": 322900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0408, - "step": 323000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0409, - "step": 323100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0418, - "step": 323200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0414, - "step": 323300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.038, - "step": 323400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0417, - "step": 323500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0411, - "step": 323600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.041, - "step": 323700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0414, - "step": 323800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0385, - "step": 323900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0399, - "step": 324000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0405, - "step": 324100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0422, - "step": 324200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0404, - "step": 324300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.04, - "step": 324400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0395, - "step": 324500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0428, - "step": 324600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0415, - "step": 324700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0402, - "step": 324800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0414, - "step": 324900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0408, - "step": 325000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0426, - "step": 325100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0437, - "step": 325200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0413, - "step": 325300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0421, - "step": 325400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.04, - "step": 325500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.04, - "step": 325600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0385, - "step": 325700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0417, - "step": 325800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0414, - "step": 325900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0407, - "step": 326000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0416, - "step": 326100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0416, - "step": 326200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0416, - "step": 326300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0405, - "step": 326400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0404, - "step": 326500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0429, - "step": 326600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0417, - "step": 326700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0397, - "step": 326800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0423, - "step": 326900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0431, - "step": 327000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0397, - "step": 327100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0402, - "step": 327200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0407, - "step": 327300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0389, - "step": 327400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0395, - "step": 327500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0398, - "step": 327600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0407, - "step": 327700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0412, - "step": 327800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.04, - "step": 327900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0409, - "step": 328000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.041, - "step": 328100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0418, - "step": 328200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0407, - "step": 328300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0415, - "step": 328400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0398, - "step": 328500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0401, - "step": 328600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0403, - "step": 328700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.04, - "step": 328800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0412, - "step": 328900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0403, - "step": 329000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0419, - "step": 329100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0414, - "step": 329200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0422, - "step": 329300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0386, - "step": 329400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0396, - "step": 329500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0428, - "step": 329600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0411, - "step": 329700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0397, - "step": 329800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0438, - "step": 329900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0397, - "step": 330000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0411, - "step": 330100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0434, - "step": 330200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0401, - "step": 330300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0405, - "step": 330400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0419, - "step": 330500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0422, - "step": 330600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0406, - "step": 330700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0411, - "step": 330800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0413, - "step": 330900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0409, - "step": 331000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0416, - "step": 331100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0427, - "step": 331200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0416, - "step": 331300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0394, - "step": 331400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0396, - "step": 331500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.039, - "step": 331600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0419, - "step": 331700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0413, - "step": 331800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0402, - "step": 331900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0422, - "step": 332000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0405, - "step": 332100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0417, - "step": 332200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0408, - "step": 332300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0407, - "step": 332400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0415, - "step": 332500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.041, - "step": 332600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0419, - "step": 332700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0415, - "step": 332800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.041, - "step": 332900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0403, - "step": 333000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0414, - "step": 333100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0402, - "step": 333200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0412, - "step": 333300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0417, - "step": 333400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0392, - "step": 333500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0424, - "step": 333600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0412, - "step": 333700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0404, - "step": 333800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0402, - "step": 333900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0396, - "step": 334000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0407, - "step": 334100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0419, - "step": 334200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0402, - "step": 334300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0403, - "step": 334400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0388, - "step": 334500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0403, - "step": 334600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0401, - "step": 334700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0419, - "step": 334800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0383, - "step": 334900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0404, - "step": 335000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0417, - "step": 335100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0402, - "step": 335200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0414, - "step": 335300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0412, - "step": 335400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0403, - "step": 335500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0412, - "step": 335600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0394, - "step": 335700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0411, - "step": 335800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0398, - "step": 335900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0421, - "step": 336000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0403, - "step": 336100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0406, - "step": 336200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0417, - "step": 336300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0384, - "step": 336400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.041, - "step": 336500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0383, - "step": 336600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0388, - "step": 336700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0415, - "step": 336800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0408, - "step": 336900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0388, - "step": 337000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0398, - "step": 337100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0377, - "step": 337200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0395, - "step": 337300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0385, - "step": 337400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0421, - "step": 337500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0399, - "step": 337600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0425, - "step": 337700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0412, - "step": 337800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0423, - "step": 337900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0414, - "step": 338000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0406, - "step": 338100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0414, - "step": 338200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0399, - "step": 338300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0386, - "step": 338400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0373, - "step": 338500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0419, - "step": 338600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0424, - "step": 338700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0417, - "step": 338800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0429, - "step": 338900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0397, - "step": 339000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0398, - "step": 339100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0411, - "step": 339200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0404, - "step": 339300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0412, - "step": 339400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0395, - "step": 339500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0407, - "step": 339600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0403, - "step": 339700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0405, - "step": 339800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0403, - "step": 339900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0398, - "step": 340000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0361328125, - "eval_runtime": 3382.3623, - "eval_samples_per_second": 332.526, - "eval_steps_per_second": 20.783, - "step": 340000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0412, - "step": 340100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0412, - "step": 340200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0423, - "step": 340300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.041, - "step": 340400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.041, - "step": 340500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0401, - "step": 340600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0406, - "step": 340700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0397, - "step": 340800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0403, - "step": 340900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0396, - "step": 341000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0422, - "step": 341100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0398, - "step": 341200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0389, - "step": 341300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0402, - "step": 341400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0412, - "step": 341500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0411, - "step": 341600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0404, - "step": 341700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0387, - "step": 341800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0403, - "step": 341900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0391, - "step": 342000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0406, - "step": 342100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0411, - "step": 342200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.039, - "step": 342300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0404, - "step": 342400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0425, - "step": 342500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.041, - "step": 342600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0398, - "step": 342700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0413, - "step": 342800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0424, - "step": 342900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0408, - "step": 343000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0413, - "step": 343100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0408, - "step": 343200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0393, - "step": 343300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0414, - "step": 343400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0424, - "step": 343500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0414, - "step": 343600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0399, - "step": 343700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0397, - "step": 343800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.04, - "step": 343900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0408, - "step": 344000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0388, - "step": 344100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.041, - "step": 344200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0408, - "step": 344300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0411, - "step": 344400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0397, - "step": 344500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0408, - "step": 344600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.042, - "step": 344700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0403, - "step": 344800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.041, - "step": 344900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0412, - "step": 345000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0396, - "step": 345100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0409, - "step": 345200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.041, - "step": 345300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0422, - "step": 345400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0385, - "step": 345500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0391, - "step": 345600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0373, - "step": 345700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.041, - "step": 345800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0423, - "step": 345900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0399, - "step": 346000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0399, - "step": 346100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0418, - "step": 346200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0407, - "step": 346300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0408, - "step": 346400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0405, - "step": 346500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0379, - "step": 346600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0405, - "step": 346700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0422, - "step": 346800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0416, - "step": 346900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0417, - "step": 347000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0402, - "step": 347100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0394, - "step": 347200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0399, - "step": 347300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0403, - "step": 347400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0397, - "step": 347500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0385, - "step": 347600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0409, - "step": 347700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.04, - "step": 347800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0411, - "step": 347900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0419, - "step": 348000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0387, - "step": 348100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0399, - "step": 348200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0428, - "step": 348300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0417, - "step": 348400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0402, - "step": 348500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0401, - "step": 348600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0399, - "step": 348700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0381, - "step": 348800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0408, - "step": 348900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0393, - "step": 349000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0378, - "step": 349100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0438, - "step": 349200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0404, - "step": 349300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0416, - "step": 349400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0405, - "step": 349500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0405, - "step": 349600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.041, - "step": 349700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.04, - "step": 349800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0385, - "step": 349900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0394, - "step": 350000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0403, - "step": 350100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0389, - "step": 350200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0418, - "step": 350300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0388, - "step": 350400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0404, - "step": 350500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0422, - "step": 350600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0391, - "step": 350700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0403, - "step": 350800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0397, - "step": 350900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0389, - "step": 351000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0412, - "step": 351100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0408, - "step": 351200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0392, - "step": 351300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0395, - "step": 351400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0384, - "step": 351500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0407, - "step": 351600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.04, - "step": 351700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.04, - "step": 351800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0419, - "step": 351900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0396, - "step": 352000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0411, - "step": 352100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0412, - "step": 352200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.041, - "step": 352300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0421, - "step": 352400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0404, - "step": 352500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0407, - "step": 352600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0396, - "step": 352700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0399, - "step": 352800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0396, - "step": 352900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0389, - "step": 353000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0401, - "step": 353100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0396, - "step": 353200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0378, - "step": 353300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0408, - "step": 353400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0422, - "step": 353500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0401, - "step": 353600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0383, - "step": 353700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0403, - "step": 353800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0404, - "step": 353900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.04, - "step": 354000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0386, - "step": 354100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.04, - "step": 354200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0409, - "step": 354300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0403, - "step": 354400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0412, - "step": 354500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.04, - "step": 354600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0415, - "step": 354700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0385, - "step": 354800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0386, - "step": 354900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0422, - "step": 355000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0407, - "step": 355100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0424, - "step": 355200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0401, - "step": 355300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0415, - "step": 355400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0398, - "step": 355500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0404, - "step": 355600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0417, - "step": 355700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0388, - "step": 355800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0384, - "step": 355900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0385, - "step": 356000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0401, - "step": 356100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0424, - "step": 356200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0392, - "step": 356300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.039, - "step": 356400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0394, - "step": 356500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0407, - "step": 356600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.039, - "step": 356700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0396, - "step": 356800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0406, - "step": 356900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0382, - "step": 357000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.041, - "step": 357100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0403, - "step": 357200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0418, - "step": 357300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0392, - "step": 357400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0408, - "step": 357500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0411, - "step": 357600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.039, - "step": 357700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0385, - "step": 357800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0396, - "step": 357900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0405, - "step": 358000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0397, - "step": 358100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0416, - "step": 358200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0409, - "step": 358300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0403, - "step": 358400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0413, - "step": 358500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0397, - "step": 358600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0423, - "step": 358700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0392, - "step": 358800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0401, - "step": 358900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0397, - "step": 359000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0402, - "step": 359100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0389, - "step": 359200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0386, - "step": 359300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0423, - "step": 359400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0381, - "step": 359500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0394, - "step": 359600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0395, - "step": 359700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0402, - "step": 359800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.041, - "step": 359900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0389, - "step": 360000 - }, - { - "epoch": 0.0, - "eval_loss": 0.035430908203125, - "eval_runtime": 3371.4761, - "eval_samples_per_second": 333.6, - "eval_steps_per_second": 20.85, - "step": 360000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0381, - "step": 360100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0409, - "step": 360200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0396, - "step": 360300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0385, - "step": 360400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0401, - "step": 360500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0401, - "step": 360600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0407, - "step": 360700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0417, - "step": 360800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.04, - "step": 360900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.04, - "step": 361000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0402, - "step": 361100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0408, - "step": 361200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0402, - "step": 361300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0396, - "step": 361400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0393, - "step": 361500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0414, - "step": 361600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0405, - "step": 361700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.041, - "step": 361800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.042, - "step": 361900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0398, - "step": 362000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0375, - "step": 362100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0407, - "step": 362200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0421, - "step": 362300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0409, - "step": 362400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0402, - "step": 362500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0397, - "step": 362600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0415, - "step": 362700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0402, - "step": 362800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0427, - "step": 362900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0404, - "step": 363000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0395, - "step": 363100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0401, - "step": 363200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.042, - "step": 363300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0402, - "step": 363400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0394, - "step": 363500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0377, - "step": 363600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0406, - "step": 363700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0402, - "step": 363800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0405, - "step": 363900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.041, - "step": 364000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 364100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.041, - "step": 364200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0395, - "step": 364300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.043, - "step": 364400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0399, - "step": 364500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0377, - "step": 364600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.039, - "step": 364700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0406, - "step": 364800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.04, - "step": 364900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0389, - "step": 365000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0388, - "step": 365100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0365, - "step": 365200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0421, - "step": 365300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0382, - "step": 365400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0404, - "step": 365500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0405, - "step": 365600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.041, - "step": 365700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0381, - "step": 365800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0397, - "step": 365900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.039, - "step": 366000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0395, - "step": 366100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0388, - "step": 366200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0396, - "step": 366300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0394, - "step": 366400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0416, - "step": 366500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0415, - "step": 366600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0406, - "step": 366700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0414, - "step": 366800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0422, - "step": 366900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0428, - "step": 367000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0413, - "step": 367100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.04, - "step": 367200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0397, - "step": 367300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0382, - "step": 367400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0373, - "step": 367500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0422, - "step": 367600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0386, - "step": 367700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.041, - "step": 367800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0382, - "step": 367900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0393, - "step": 368000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0389, - "step": 368100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0388, - "step": 368200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0393, - "step": 368300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0407, - "step": 368400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.04, - "step": 368500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0398, - "step": 368600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0391, - "step": 368700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0419, - "step": 368800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0405, - "step": 368900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0389, - "step": 369000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0414, - "step": 369100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0394, - "step": 369200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0395, - "step": 369300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0393, - "step": 369400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0401, - "step": 369500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0414, - "step": 369600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0401, - "step": 369700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0388, - "step": 369800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.041, - "step": 369900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0416, - "step": 370000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0386, - "step": 370100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0382, - "step": 370200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0395, - "step": 370300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0385, - "step": 370400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0408, - "step": 370500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0397, - "step": 370600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0395, - "step": 370700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0391, - "step": 370800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0403, - "step": 370900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0404, - "step": 371000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0406, - "step": 371100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0397, - "step": 371200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0401, - "step": 371300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.041, - "step": 371400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0374, - "step": 371500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0411, - "step": 371600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0388, - "step": 371700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0382, - "step": 371800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0376, - "step": 371900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0399, - "step": 372000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0394, - "step": 372100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0403, - "step": 372200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0399, - "step": 372300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0424, - "step": 372400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0405, - "step": 372500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0394, - "step": 372600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0398, - "step": 372700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.04, - "step": 372800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0396, - "step": 372900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0396, - "step": 373000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0392, - "step": 373100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0409, - "step": 373200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0404, - "step": 373300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0414, - "step": 373400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0388, - "step": 373500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0399, - "step": 373600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.04, - "step": 373700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0399, - "step": 373800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0414, - "step": 373900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0392, - "step": 374000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0398, - "step": 374100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0378, - "step": 374200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0435, - "step": 374300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0386, - "step": 374400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0395, - "step": 374500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0398, - "step": 374600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0411, - "step": 374700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0382, - "step": 374800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0389, - "step": 374900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0401, - "step": 375000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0408, - "step": 375100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.04, - "step": 375200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0417, - "step": 375300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0393, - "step": 375400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0403, - "step": 375500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0406, - "step": 375600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0383, - "step": 375700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0425, - "step": 375800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0406, - "step": 375900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 376000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0388, - "step": 376100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0413, - "step": 376200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0421, - "step": 376300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.039, - "step": 376400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.039, - "step": 376500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.041, - "step": 376600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0395, - "step": 376700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0408, - "step": 376800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0441, - "step": 376900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0375, - "step": 377000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0413, - "step": 377100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0404, - "step": 377200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0382, - "step": 377300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0406, - "step": 377400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0388, - "step": 377500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0389, - "step": 377600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0406, - "step": 377700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0405, - "step": 377800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0378, - "step": 377900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0389, - "step": 378000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0403, - "step": 378100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0397, - "step": 378200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0405, - "step": 378300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0403, - "step": 378400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0402, - "step": 378500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.04, - "step": 378600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0404, - "step": 378700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0403, - "step": 378800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0391, - "step": 378900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0382, - "step": 379000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0395, - "step": 379100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0376, - "step": 379200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0398, - "step": 379300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0402, - "step": 379400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0384, - "step": 379500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0379, - "step": 379600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0408, - "step": 379700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.039, - "step": 379800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0413, - "step": 379900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.039, - "step": 380000 - }, - { - "epoch": 0.0, - "eval_loss": 0.035675048828125, - "eval_runtime": 3845.7026, - "eval_samples_per_second": 292.462, - "eval_steps_per_second": 18.279, - "step": 380000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0409, - "step": 380100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.038, - "step": 380200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0388, - "step": 380300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0391, - "step": 380400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0408, - "step": 380500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0402, - "step": 380600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.039, - "step": 380700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.039, - "step": 380800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0415, - "step": 380900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.039, - "step": 381000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0421, - "step": 381100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0406, - "step": 381200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0398, - "step": 381300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0383, - "step": 381400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0397, - "step": 381500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0379, - "step": 381600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0396, - "step": 381700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0391, - "step": 381800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0377, - "step": 381900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0415, - "step": 382000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0382, - "step": 382100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0427, - "step": 382200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0395, - "step": 382300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.04, - "step": 382400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0382, - "step": 382500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0409, - "step": 382600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0402, - "step": 382700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0404, - "step": 382800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0386, - "step": 382900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0388, - "step": 383000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0376, - "step": 383100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0431, - "step": 383200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0406, - "step": 383300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0403, - "step": 383400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0399, - "step": 383500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.038, - "step": 383600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0393, - "step": 383700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0409, - "step": 383800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0388, - "step": 383900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0383, - "step": 384000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0388, - "step": 384100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0393, - "step": 384200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0395, - "step": 384300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0398, - "step": 384400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0386, - "step": 384500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0389, - "step": 384600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0396, - "step": 384700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0393, - "step": 384800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0383, - "step": 384900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0387, - "step": 385000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0398, - "step": 385100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.038, - "step": 385200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0404, - "step": 385300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0399, - "step": 385400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0387, - "step": 385500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0386, - "step": 385600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0397, - "step": 385700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0399, - "step": 385800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0407, - "step": 385900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0377, - "step": 386000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0401, - "step": 386100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0419, - "step": 386200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0379, - "step": 386300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0402, - "step": 386400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0406, - "step": 386500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0391, - "step": 386600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.039, - "step": 386700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0387, - "step": 386800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0391, - "step": 386900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0406, - "step": 387000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.039, - "step": 387100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0387, - "step": 387200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0374, - "step": 387300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0384, - "step": 387400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0396, - "step": 387500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0401, - "step": 387600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0398, - "step": 387700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0416, - "step": 387800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0385, - "step": 387900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0373, - "step": 388000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0375, - "step": 388100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.04, - "step": 388200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0383, - "step": 388300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0393, - "step": 388400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0398, - "step": 388500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0394, - "step": 388600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0404, - "step": 388700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0372, - "step": 388800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0398, - "step": 388900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0404, - "step": 389000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0381, - "step": 389100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.039, - "step": 389200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0378, - "step": 389300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0375, - "step": 389400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0414, - "step": 389500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0411, - "step": 389600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0399, - "step": 389700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0395, - "step": 389800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0407, - "step": 389900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0416, - "step": 390000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0399, - "step": 390100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0394, - "step": 390200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0388, - "step": 390300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0404, - "step": 390400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0395, - "step": 390500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0383, - "step": 390600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0414, - "step": 390700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0371, - "step": 390800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.041, - "step": 390900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0398, - "step": 391000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0384, - "step": 391100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.039, - "step": 391200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0375, - "step": 391300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0403, - "step": 391400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0402, - "step": 391500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0387, - "step": 391600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0396, - "step": 391700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0403, - "step": 391800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0387, - "step": 391900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0399, - "step": 392000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0391, - "step": 392100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0377, - "step": 392200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0379, - "step": 392300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0403, - "step": 392400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0378, - "step": 392500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0392, - "step": 392600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0381, - "step": 392700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0383, - "step": 392800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0382, - "step": 392900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0388, - "step": 393000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0409, - "step": 393100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.04, - "step": 393200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0386, - "step": 393300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0398, - "step": 393400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0374, - "step": 393500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 393600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0392, - "step": 393700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0386, - "step": 393800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.038, - "step": 393900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0388, - "step": 394000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0397, - "step": 394100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0406, - "step": 394200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0393, - "step": 394300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.039, - "step": 394400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0386, - "step": 394500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0397, - "step": 394600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0426, - "step": 394700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0385, - "step": 394800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0389, - "step": 394900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0402, - "step": 395000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0394, - "step": 395100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 395200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0405, - "step": 395300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 395400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0387, - "step": 395500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0402, - "step": 395600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0377, - "step": 395700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0388, - "step": 395800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0388, - "step": 395900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0399, - "step": 396000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0398, - "step": 396100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0386, - "step": 396200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0402, - "step": 396300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0414, - "step": 396400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0395, - "step": 396500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0391, - "step": 396600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0405, - "step": 396700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0406, - "step": 396800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0395, - "step": 396900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0403, - "step": 397000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0384, - "step": 397100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0379, - "step": 397200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0381, - "step": 397300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0392, - "step": 397400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0382, - "step": 397500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0374, - "step": 397600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.041, - "step": 397700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0378, - "step": 397800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0388, - "step": 397900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0375, - "step": 398000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0391, - "step": 398100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0414, - "step": 398200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0399, - "step": 398300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0403, - "step": 398400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0388, - "step": 398500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0385, - "step": 398600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0409, - "step": 398700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0399, - "step": 398800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0377, - "step": 398900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0386, - "step": 399000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0423, - "step": 399100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0396, - "step": 399200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0391, - "step": 399300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0391, - "step": 399400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0403, - "step": 399500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0386, - "step": 399600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0369, - "step": 399700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0398, - "step": 399800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.04, - "step": 399900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0405, - "step": 400000 - }, - { - "epoch": 0.0, - "eval_loss": 0.034576416015625, - "eval_runtime": 3914.1663, - "eval_samples_per_second": 287.347, - "eval_steps_per_second": 17.959, - "step": 400000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0382, - "step": 400100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0388, - "step": 400200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0387, - "step": 400300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0404, - "step": 400400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0404, - "step": 400500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0388, - "step": 400600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0382, - "step": 400700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.039, - "step": 400800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.038, - "step": 400900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0378, - "step": 401000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0363, - "step": 401100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0405, - "step": 401200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0391, - "step": 401300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0389, - "step": 401400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.04, - "step": 401500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0388, - "step": 401600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.038, - "step": 401700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0407, - "step": 401800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.039, - "step": 401900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0399, - "step": 402000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0381, - "step": 402100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0399, - "step": 402200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0371, - "step": 402300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.039, - "step": 402400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0391, - "step": 402500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0405, - "step": 402600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0386, - "step": 402700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0392, - "step": 402800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0407, - "step": 402900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.039, - "step": 403000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0404, - "step": 403100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0382, - "step": 403200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0366, - "step": 403300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0391, - "step": 403400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0414, - "step": 403500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0367, - "step": 403600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0405, - "step": 403700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0375, - "step": 403800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0364, - "step": 403900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0385, - "step": 404000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0407, - "step": 404100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0378, - "step": 404200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0384, - "step": 404300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0396, - "step": 404400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0406, - "step": 404500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0394, - "step": 404600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0383, - "step": 404700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0394, - "step": 404800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0405, - "step": 404900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0408, - "step": 405000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0392, - "step": 405100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0382, - "step": 405200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0382, - "step": 405300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0392, - "step": 405400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0392, - "step": 405500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0381, - "step": 405600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0391, - "step": 405700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0388, - "step": 405800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0395, - "step": 405900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0389, - "step": 406000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0412, - "step": 406100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0385, - "step": 406200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0396, - "step": 406300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0367, - "step": 406400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0384, - "step": 406500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0379, - "step": 406600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0394, - "step": 406700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0393, - "step": 406800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0395, - "step": 406900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0414, - "step": 407000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0396, - "step": 407100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0379, - "step": 407200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0396, - "step": 407300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0388, - "step": 407400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0395, - "step": 407500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0396, - "step": 407600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0397, - "step": 407700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0382, - "step": 407800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0383, - "step": 407900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0386, - "step": 408000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0402, - "step": 408100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0385, - "step": 408200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0393, - "step": 408300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0388, - "step": 408400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0405, - "step": 408500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0384, - "step": 408600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0411, - "step": 408700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0381, - "step": 408800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0389, - "step": 408900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0367, - "step": 409000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0381, - "step": 409100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.039, - "step": 409200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0384, - "step": 409300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0398, - "step": 409400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0416, - "step": 409500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0391, - "step": 409600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0389, - "step": 409700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.04, - "step": 409800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0413, - "step": 409900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0412, - "step": 410000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0385, - "step": 410100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0372, - "step": 410200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0392, - "step": 410300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0374, - "step": 410400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0389, - "step": 410500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0389, - "step": 410600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.038, - "step": 410700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0378, - "step": 410800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0388, - "step": 410900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0372, - "step": 411000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0384, - "step": 411100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0416, - "step": 411200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0411, - "step": 411300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0401, - "step": 411400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.04, - "step": 411500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0401, - "step": 411600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0405, - "step": 411700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0382, - "step": 411800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0385, - "step": 411900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0377, - "step": 412000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0383, - "step": 412100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0412, - "step": 412200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0393, - "step": 412300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0387, - "step": 412400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0386, - "step": 412500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0418, - "step": 412600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0409, - "step": 412700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0374, - "step": 412800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0391, - "step": 412900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0377, - "step": 413000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0402, - "step": 413100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.04, - "step": 413200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0373, - "step": 413300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0382, - "step": 413400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0387, - "step": 413500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0372, - "step": 413600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0378, - "step": 413700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0381, - "step": 413800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.041, - "step": 413900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 414000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.039, - "step": 414100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0407, - "step": 414200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0381, - "step": 414300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0389, - "step": 414400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0393, - "step": 414500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0388, - "step": 414600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0403, - "step": 414700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0392, - "step": 414800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0395, - "step": 414900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0386, - "step": 415000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0368, - "step": 415100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0388, - "step": 415200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0384, - "step": 415300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.039, - "step": 415400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0387, - "step": 415500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0375, - "step": 415600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0401, - "step": 415700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0376, - "step": 415800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0388, - "step": 415900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0384, - "step": 416000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0403, - "step": 416100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0392, - "step": 416200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.04, - "step": 416300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0402, - "step": 416400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0417, - "step": 416500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0385, - "step": 416600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0365, - "step": 416700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0393, - "step": 416800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0399, - "step": 416900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0394, - "step": 417000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0397, - "step": 417100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0402, - "step": 417200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0383, - "step": 417300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0385, - "step": 417400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0411, - "step": 417500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0391, - "step": 417600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0372, - "step": 417700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0375, - "step": 417800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0379, - "step": 417900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0393, - "step": 418000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0388, - "step": 418100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0411, - "step": 418200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0386, - "step": 418300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0397, - "step": 418400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0408, - "step": 418500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0407, - "step": 418600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0394, - "step": 418700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0386, - "step": 418800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0381, - "step": 418900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0373, - "step": 419000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0406, - "step": 419100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0377, - "step": 419200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0401, - "step": 419300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0409, - "step": 419400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0389, - "step": 419500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0399, - "step": 419600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0391, - "step": 419700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0381, - "step": 419800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0372, - "step": 419900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0398, - "step": 420000 - }, - { - "epoch": 0.0, - "eval_loss": 0.033843994140625, - "eval_runtime": 3662.1217, - "eval_samples_per_second": 307.123, - "eval_steps_per_second": 19.195, - "step": 420000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0369, - "step": 420100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0398, - "step": 420200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0403, - "step": 420300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0369, - "step": 420400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0403, - "step": 420500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0391, - "step": 420600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.038, - "step": 420700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0394, - "step": 420800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0386, - "step": 420900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0385, - "step": 421000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0376, - "step": 421100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0385, - "step": 421200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0388, - "step": 421300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0374, - "step": 421400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0405, - "step": 421500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0402, - "step": 421600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0389, - "step": 421700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.038, - "step": 421800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0377, - "step": 421900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0396, - "step": 422000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0389, - "step": 422100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.039, - "step": 422200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0368, - "step": 422300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0419, - "step": 422400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0397, - "step": 422500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0392, - "step": 422600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0388, - "step": 422700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0383, - "step": 422800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0378, - "step": 422900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0363, - "step": 423000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0366, - "step": 423100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0389, - "step": 423200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0388, - "step": 423300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0377, - "step": 423400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 423500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 423600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0385, - "step": 423700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0405, - "step": 423800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0383, - "step": 423900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0392, - "step": 424000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0367, - "step": 424100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0408, - "step": 424200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0381, - "step": 424300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0364, - "step": 424400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0387, - "step": 424500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0385, - "step": 424600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0371, - "step": 424700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0398, - "step": 424800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0391, - "step": 424900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0398, - "step": 425000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0374, - "step": 425100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0371, - "step": 425200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0388, - "step": 425300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 425400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0386, - "step": 425500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0375, - "step": 425600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0394, - "step": 425700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0364, - "step": 425800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0375, - "step": 425900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0375, - "step": 426000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0379, - "step": 426100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0384, - "step": 426200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.039, - "step": 426300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0391, - "step": 426400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0374, - "step": 426500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.041, - "step": 426600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0373, - "step": 426700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0368, - "step": 426800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0388, - "step": 426900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0408, - "step": 427000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.039, - "step": 427100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 427200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0393, - "step": 427300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0374, - "step": 427400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.038, - "step": 427500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0379, - "step": 427600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0392, - "step": 427700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 427800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0388, - "step": 427900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0369, - "step": 428000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0393, - "step": 428100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0373, - "step": 428200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0388, - "step": 428300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0398, - "step": 428400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0389, - "step": 428500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0415, - "step": 428600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0379, - "step": 428700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0378, - "step": 428800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0372, - "step": 428900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0373, - "step": 429000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0374, - "step": 429100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0385, - "step": 429200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0374, - "step": 429300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0404, - "step": 429400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 429500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0375, - "step": 429600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0393, - "step": 429700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 429800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0388, - "step": 429900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0384, - "step": 430000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0391, - "step": 430100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0388, - "step": 430200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0395, - "step": 430300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0383, - "step": 430400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0408, - "step": 430500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0376, - "step": 430600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0394, - "step": 430700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0395, - "step": 430800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0398, - "step": 430900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.038, - "step": 431000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0406, - "step": 431100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0383, - "step": 431200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0388, - "step": 431300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0364, - "step": 431400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0378, - "step": 431500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0379, - "step": 431600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.039, - "step": 431700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 431800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0384, - "step": 431900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0366, - "step": 432000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0399, - "step": 432100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0393, - "step": 432200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0377, - "step": 432300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0395, - "step": 432400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0384, - "step": 432500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0389, - "step": 432600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0377, - "step": 432700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0379, - "step": 432800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0388, - "step": 432900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0391, - "step": 433000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0392, - "step": 433100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0383, - "step": 433200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0388, - "step": 433300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0387, - "step": 433400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.038, - "step": 433500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0397, - "step": 433600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0372, - "step": 433700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0376, - "step": 433800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 433900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0416, - "step": 434000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0396, - "step": 434100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0387, - "step": 434200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 434300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 434400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0363, - "step": 434500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0383, - "step": 434600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0378, - "step": 434700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0386, - "step": 434800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0375, - "step": 434900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0383, - "step": 435000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0403, - "step": 435100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0386, - "step": 435200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0379, - "step": 435300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0386, - "step": 435400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0384, - "step": 435500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0407, - "step": 435600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0364, - "step": 435700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0386, - "step": 435800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.038, - "step": 435900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0372, - "step": 436000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0385, - "step": 436100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0394, - "step": 436200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0382, - "step": 436300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0393, - "step": 436400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0402, - "step": 436500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0363, - "step": 436600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0396, - "step": 436700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0379, - "step": 436800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0367, - "step": 436900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0375, - "step": 437000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 437100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0386, - "step": 437200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0383, - "step": 437300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 437400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0383, - "step": 437500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 437600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0385, - "step": 437700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0365, - "step": 437800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0384, - "step": 437900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0395, - "step": 438000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0401, - "step": 438100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0401, - "step": 438200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.038, - "step": 438300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0378, - "step": 438400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 438500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0405, - "step": 438600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.039, - "step": 438700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0383, - "step": 438800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0379, - "step": 438900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.039, - "step": 439000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0403, - "step": 439100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0385, - "step": 439200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 439300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0398, - "step": 439400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0386, - "step": 439500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0386, - "step": 439600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0401, - "step": 439700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0364, - "step": 439800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0378, - "step": 439900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0377, - "step": 440000 - }, - { - "epoch": 0.0, - "eval_loss": 0.03411865234375, - "eval_runtime": 4314.8615, - "eval_samples_per_second": 260.663, - "eval_steps_per_second": 16.292, - "step": 440000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0396, - "step": 440100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0401, - "step": 440200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0379, - "step": 440300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0397, - "step": 440400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.04, - "step": 440500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0385, - "step": 440600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0388, - "step": 440700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0366, - "step": 440800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0369, - "step": 440900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0374, - "step": 441000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0391, - "step": 441100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0376, - "step": 441200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 441300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0401, - "step": 441400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0377, - "step": 441500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0396, - "step": 441600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0396, - "step": 441700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.038, - "step": 441800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0367, - "step": 441900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0396, - "step": 442000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0375, - "step": 442100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0384, - "step": 442200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0389, - "step": 442300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0369, - "step": 442400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0405, - "step": 442500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0391, - "step": 442600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.038, - "step": 442700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0371, - "step": 442800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 442900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0399, - "step": 443000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0385, - "step": 443100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0371, - "step": 443200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0385, - "step": 443300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0398, - "step": 443400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0373, - "step": 443500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0386, - "step": 443600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 443700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0382, - "step": 443800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0384, - "step": 443900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0402, - "step": 444000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0392, - "step": 444100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0386, - "step": 444200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0376, - "step": 444300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0381, - "step": 444400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0373, - "step": 444500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0377, - "step": 444600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0388, - "step": 444700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0377, - "step": 444800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.04, - "step": 444900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.039, - "step": 445000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0384, - "step": 445100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0397, - "step": 445200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0373, - "step": 445300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 445400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0387, - "step": 445500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0382, - "step": 445600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0386, - "step": 445700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0375, - "step": 445800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0383, - "step": 445900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.038, - "step": 446000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0393, - "step": 446100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0369, - "step": 446200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 446300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0393, - "step": 446400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.039, - "step": 446500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0375, - "step": 446600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0387, - "step": 446700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0384, - "step": 446800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0367, - "step": 446900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0391, - "step": 447000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0378, - "step": 447100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0394, - "step": 447200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0389, - "step": 447300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0374, - "step": 447400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0379, - "step": 447500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0375, - "step": 447600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0372, - "step": 447700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0375, - "step": 447800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0373, - "step": 447900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 448000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0369, - "step": 448100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0385, - "step": 448200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0371, - "step": 448300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0378, - "step": 448400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 448500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0378, - "step": 448600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.04, - "step": 448700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 448800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0376, - "step": 448900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0382, - "step": 449000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0424, - "step": 449100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0373, - "step": 449200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 449300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.039, - "step": 449400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0396, - "step": 449500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0388, - "step": 449600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0373, - "step": 449700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0384, - "step": 449800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0379, - "step": 449900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0372, - "step": 450000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0381, - "step": 450100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0399, - "step": 450200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0406, - "step": 450300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0385, - "step": 450400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.038, - "step": 450500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0384, - "step": 450600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0394, - "step": 450700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0406, - "step": 450800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0381, - "step": 450900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0376, - "step": 451000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0374, - "step": 451100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0367, - "step": 451200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0392, - "step": 451300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0402, - "step": 451400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0372, - "step": 451500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0391, - "step": 451600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0386, - "step": 451700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0383, - "step": 451800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0365, - "step": 451900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0374, - "step": 452000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0419, - "step": 452100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0385, - "step": 452200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0375, - "step": 452300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0371, - "step": 452400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0387, - "step": 452500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0399, - "step": 452600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0386, - "step": 452700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0382, - "step": 452800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0409, - "step": 452900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 453000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0371, - "step": 453100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0376, - "step": 453200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 453300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.039, - "step": 453400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0383, - "step": 453500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0385, - "step": 453600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0389, - "step": 453700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0387, - "step": 453800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0374, - "step": 453900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0376, - "step": 454000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0364, - "step": 454100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0392, - "step": 454200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0404, - "step": 454300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0373, - "step": 454400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0365, - "step": 454500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 454600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0413, - "step": 454700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0369, - "step": 454800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0375, - "step": 454900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0377, - "step": 455000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0383, - "step": 455100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0389, - "step": 455200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0383, - "step": 455300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.039, - "step": 455400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 455500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0373, - "step": 455600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0391, - "step": 455700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0391, - "step": 455800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 455900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 456000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0384, - "step": 456100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0385, - "step": 456200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0409, - "step": 456300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0367, - "step": 456400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.038, - "step": 456500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0401, - "step": 456600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0385, - "step": 456700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0379, - "step": 456800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0383, - "step": 456900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0372, - "step": 457000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0381, - "step": 457100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0389, - "step": 457200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0367, - "step": 457300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0382, - "step": 457400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.038, - "step": 457500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0367, - "step": 457600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0392, - "step": 457700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0383, - "step": 457800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0366, - "step": 457900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0371, - "step": 458000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0372, - "step": 458100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0382, - "step": 458200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0383, - "step": 458300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0376, - "step": 458400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0387, - "step": 458500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0376, - "step": 458600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0363, - "step": 458700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 458800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0384, - "step": 458900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0374, - "step": 459000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0395, - "step": 459100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0384, - "step": 459200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0391, - "step": 459300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0374, - "step": 459400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0388, - "step": 459500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0379, - "step": 459600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0379, - "step": 459700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0382, - "step": 459800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0377, - "step": 459900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0383, - "step": 460000 - }, - { - "epoch": 0.0, - "eval_loss": 0.033843994140625, - "eval_runtime": 4130.1582, - "eval_samples_per_second": 272.32, - "eval_steps_per_second": 17.02, - "step": 460000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.039, - "step": 460100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0396, - "step": 460200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0396, - "step": 460300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0364, - "step": 460400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0378, - "step": 460500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0382, - "step": 460600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 460700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0382, - "step": 460800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0408, - "step": 460900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0376, - "step": 461000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0365, - "step": 461100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0372, - "step": 461200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0377, - "step": 461300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0378, - "step": 461400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 461500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0399, - "step": 461600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0413, - "step": 461700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0363, - "step": 461800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0389, - "step": 461900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 462000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0386, - "step": 462100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0416, - "step": 462200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0373, - "step": 462300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0383, - "step": 462400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0393, - "step": 462500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0383, - "step": 462600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0368, - "step": 462700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0371, - "step": 462800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0372, - "step": 462900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0384, - "step": 463000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.038, - "step": 463100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0377, - "step": 463200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0374, - "step": 463300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0375, - "step": 463400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0375, - "step": 463500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0369, - "step": 463600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 463700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0384, - "step": 463800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 463900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 464000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0374, - "step": 464100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0402, - "step": 464200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0382, - "step": 464300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0396, - "step": 464400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0367, - "step": 464500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0385, - "step": 464600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0405, - "step": 464700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0368, - "step": 464800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0378, - "step": 464900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0384, - "step": 465000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0377, - "step": 465100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 465200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0365, - "step": 465300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0379, - "step": 465400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0398, - "step": 465500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0368, - "step": 465600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0367, - "step": 465700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0382, - "step": 465800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0394, - "step": 465900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0395, - "step": 466000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0375, - "step": 466100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0373, - "step": 466200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0376, - "step": 466300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0396, - "step": 466400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0364, - "step": 466500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0376, - "step": 466600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0385, - "step": 466700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0397, - "step": 466800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0391, - "step": 466900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0373, - "step": 467000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0378, - "step": 467100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0379, - "step": 467200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0375, - "step": 467300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0373, - "step": 467400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0398, - "step": 467500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0376, - "step": 467600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0386, - "step": 467700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0382, - "step": 467800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0405, - "step": 467900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.038, - "step": 468000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0398, - "step": 468100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 468200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0378, - "step": 468300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 468400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 468500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0363, - "step": 468600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0378, - "step": 468700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.038, - "step": 468800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0374, - "step": 468900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0395, - "step": 469000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0378, - "step": 469100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 469200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0388, - "step": 469300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0384, - "step": 469400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 469500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 469600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0374, - "step": 469700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0389, - "step": 469800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0381, - "step": 469900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0378, - "step": 470000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0373, - "step": 470100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.038, - "step": 470200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.039, - "step": 470300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0392, - "step": 470400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0385, - "step": 470500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0391, - "step": 470600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 470700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0379, - "step": 470800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0419, - "step": 470900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0381, - "step": 471000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0378, - "step": 471100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0363, - "step": 471200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0385, - "step": 471300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0366, - "step": 471400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.04, - "step": 471500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0403, - "step": 471600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 471700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0364, - "step": 471800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0375, - "step": 471900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.038, - "step": 472000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0384, - "step": 472100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0383, - "step": 472200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0383, - "step": 472300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0391, - "step": 472400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0365, - "step": 472500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0368, - "step": 472600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0408, - "step": 472700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0374, - "step": 472800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 472900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.038, - "step": 473000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.039, - "step": 473100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0369, - "step": 473200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0402, - "step": 473300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.038, - "step": 473400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0374, - "step": 473500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0385, - "step": 473600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0368, - "step": 473700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0385, - "step": 473800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0389, - "step": 473900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0386, - "step": 474000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 474100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0373, - "step": 474200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0369, - "step": 474300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 474400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0383, - "step": 474500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 474600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0364, - "step": 474700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0387, - "step": 474800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0384, - "step": 474900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0389, - "step": 475000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0368, - "step": 475100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0377, - "step": 475200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 475300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0381, - "step": 475400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0391, - "step": 475500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0389, - "step": 475600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0384, - "step": 475700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0375, - "step": 475800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0386, - "step": 475900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 476000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0383, - "step": 476100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0381, - "step": 476200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0391, - "step": 476300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 476400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0397, - "step": 476500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0375, - "step": 476600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0373, - "step": 476700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0385, - "step": 476800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0379, - "step": 476900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0397, - "step": 477000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0381, - "step": 477100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0384, - "step": 477200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0378, - "step": 477300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0378, - "step": 477400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0376, - "step": 477500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0378, - "step": 477600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0365, - "step": 477700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0393, - "step": 477800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0415, - "step": 477900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0383, - "step": 478000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0375, - "step": 478100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0389, - "step": 478200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0385, - "step": 478300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 478400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0365, - "step": 478500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0383, - "step": 478600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0382, - "step": 478700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0366, - "step": 478800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0378, - "step": 478900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0382, - "step": 479000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0374, - "step": 479100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 479200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0381, - "step": 479300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0397, - "step": 479400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0383, - "step": 479500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 479600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0365, - "step": 479700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 479800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0387, - "step": 479900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0375, - "step": 480000 - }, - { - "epoch": 0.0, - "eval_loss": 0.033477783203125, - "eval_runtime": 2992.0731, - "eval_samples_per_second": 375.901, - "eval_steps_per_second": 23.494, - "step": 480000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.039, - "step": 480100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0402, - "step": 480200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 480300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0389, - "step": 480400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 480500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 480600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0377, - "step": 480700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0383, - "step": 480800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0379, - "step": 480900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0391, - "step": 481000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.038, - "step": 481100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0371, - "step": 481200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0367, - "step": 481300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.038, - "step": 481400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0364, - "step": 481500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0371, - "step": 481600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0392, - "step": 481700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 481800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0379, - "step": 481900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0375, - "step": 482000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0381, - "step": 482100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0378, - "step": 482200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 482300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0378, - "step": 482400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 482500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 482600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0381, - "step": 482700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0369, - "step": 482800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 482900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.038, - "step": 483000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 483100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0367, - "step": 483200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0397, - "step": 483300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.039, - "step": 483400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0395, - "step": 483500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.039, - "step": 483600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0379, - "step": 483700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0378, - "step": 483800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0385, - "step": 483900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0369, - "step": 484000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 484100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0369, - "step": 484200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0369, - "step": 484300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 484400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0376, - "step": 484500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 484600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0374, - "step": 484700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0381, - "step": 484800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 484900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0373, - "step": 485000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0377, - "step": 485100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0387, - "step": 485200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0407, - "step": 485300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 485400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0379, - "step": 485500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.039, - "step": 485600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0367, - "step": 485700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 485800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 485900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0374, - "step": 486000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 486100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0375, - "step": 486200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 486300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.041, - "step": 486400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0367, - "step": 486500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 486600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 486700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0366, - "step": 486800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0382, - "step": 486900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0388, - "step": 487000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0393, - "step": 487100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0394, - "step": 487200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0372, - "step": 487300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0379, - "step": 487400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0386, - "step": 487500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0375, - "step": 487600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0373, - "step": 487700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0366, - "step": 487800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0391, - "step": 487900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0364, - "step": 488000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0392, - "step": 488100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0377, - "step": 488200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0371, - "step": 488300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 488400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0385, - "step": 488500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0407, - "step": 488600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0367, - "step": 488700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0367, - "step": 488800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0369, - "step": 488900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0365, - "step": 489000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0366, - "step": 489100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 489200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0378, - "step": 489300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0374, - "step": 489400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0402, - "step": 489500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 489600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 489700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 489800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0367, - "step": 489900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0393, - "step": 490000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0365, - "step": 490100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0364, - "step": 490200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0388, - "step": 490300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0369, - "step": 490400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.039, - "step": 490500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0379, - "step": 490600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0368, - "step": 490700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 490800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0363, - "step": 490900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0373, - "step": 491000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 491100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0388, - "step": 491200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0379, - "step": 491300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0382, - "step": 491400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0368, - "step": 491500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.039, - "step": 491600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0389, - "step": 491700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0381, - "step": 491800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0403, - "step": 491900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 492000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0383, - "step": 492100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 492200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0383, - "step": 492300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 492400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0366, - "step": 492500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0367, - "step": 492600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.039, - "step": 492700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0376, - "step": 492800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0382, - "step": 492900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0373, - "step": 493000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0369, - "step": 493100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0381, - "step": 493200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0379, - "step": 493300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0367, - "step": 493400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 493500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0363, - "step": 493600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0378, - "step": 493700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.038, - "step": 493800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 493900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0371, - "step": 494000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0379, - "step": 494100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0379, - "step": 494200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0375, - "step": 494300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0375, - "step": 494400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0367, - "step": 494500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0381, - "step": 494600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0367, - "step": 494700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 494800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0375, - "step": 494900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0383, - "step": 495000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0375, - "step": 495100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0379, - "step": 495200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0385, - "step": 495300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0368, - "step": 495400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0379, - "step": 495500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0373, - "step": 495600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0366, - "step": 495700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 495800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0377, - "step": 495900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 496000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0367, - "step": 496100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0373, - "step": 496200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0393, - "step": 496300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0363, - "step": 496400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0373, - "step": 496500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0365, - "step": 496600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0405, - "step": 496700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0377, - "step": 496800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0366, - "step": 496900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0383, - "step": 497000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0376, - "step": 497100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0366, - "step": 497200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0379, - "step": 497300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0375, - "step": 497400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0382, - "step": 497500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 497600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0381, - "step": 497700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0376, - "step": 497800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0366, - "step": 497900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 498000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 498100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0372, - "step": 498200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0394, - "step": 498300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0386, - "step": 498400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0364, - "step": 498500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 498600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0375, - "step": 498700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0363, - "step": 498800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0367, - "step": 498900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0367, - "step": 499000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 499100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 499200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 499300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0368, - "step": 499400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 499500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0385, - "step": 499600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0369, - "step": 499700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0367, - "step": 499800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0375, - "step": 499900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0382, - "step": 500000 - }, - { - "epoch": 0.0, - "eval_loss": 0.033447265625, - "eval_runtime": 3057.7078, - "eval_samples_per_second": 367.832, - "eval_steps_per_second": 22.99, - "step": 500000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0385, - "step": 500100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0395, - "step": 500200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 500300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0378, - "step": 500400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0377, - "step": 500500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0366, - "step": 500600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0387, - "step": 500700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 500800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0388, - "step": 500900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0365, - "step": 501000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 501100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0372, - "step": 501200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0382, - "step": 501300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0393, - "step": 501400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 501500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.038, - "step": 501600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0379, - "step": 501700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0366, - "step": 501800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0377, - "step": 501900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0377, - "step": 502000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 502100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 502200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0386, - "step": 502300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0365, - "step": 502400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0376, - "step": 502500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0372, - "step": 502600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 502700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0366, - "step": 502800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0382, - "step": 502900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.038, - "step": 503000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0374, - "step": 503100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0368, - "step": 503200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0398, - "step": 503300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0366, - "step": 503400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0379, - "step": 503500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0364, - "step": 503600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0378, - "step": 503700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 503800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 503900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0391, - "step": 504000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 504100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 504200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0379, - "step": 504300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0375, - "step": 504400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0388, - "step": 504500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0366, - "step": 504600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 504700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.039, - "step": 504800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0379, - "step": 504900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 505000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 505100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0389, - "step": 505200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0364, - "step": 505300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.038, - "step": 505400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 505500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0372, - "step": 505600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0366, - "step": 505700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0369, - "step": 505800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0381, - "step": 505900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0367, - "step": 506000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0374, - "step": 506100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 506200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0369, - "step": 506300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 506400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0377, - "step": 506500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0377, - "step": 506600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0368, - "step": 506700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0369, - "step": 506800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0364, - "step": 506900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0397, - "step": 507000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0364, - "step": 507100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0368, - "step": 507200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0365, - "step": 507300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0369, - "step": 507400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0382, - "step": 507500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 507600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0388, - "step": 507700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0369, - "step": 507800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0369, - "step": 507900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0379, - "step": 508000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0371, - "step": 508100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0411, - "step": 508200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 508300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0378, - "step": 508400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0388, - "step": 508500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0369, - "step": 508600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0371, - "step": 508700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0395, - "step": 508800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0386, - "step": 508900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0388, - "step": 509000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0372, - "step": 509100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0378, - "step": 509200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0373, - "step": 509300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0385, - "step": 509400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0375, - "step": 509500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0366, - "step": 509600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.04, - "step": 509700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0378, - "step": 509800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 509900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0369, - "step": 510000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0388, - "step": 510100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0363, - "step": 510200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 510300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0379, - "step": 510400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0384, - "step": 510500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0368, - "step": 510600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0363, - "step": 510700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0375, - "step": 510800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0392, - "step": 510900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0376, - "step": 511000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0391, - "step": 511100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 511200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0383, - "step": 511300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0366, - "step": 511400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0372, - "step": 511500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0382, - "step": 511600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 511700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0388, - "step": 511800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0385, - "step": 511900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 512000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0387, - "step": 512100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0376, - "step": 512200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 512300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 512400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0376, - "step": 512500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0382, - "step": 512600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 512700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0374, - "step": 512800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 512900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0365, - "step": 513000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0373, - "step": 513100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0371, - "step": 513200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 513300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0369, - "step": 513400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0404, - "step": 513500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0364, - "step": 513600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0365, - "step": 513700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0377, - "step": 513800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0372, - "step": 513900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0389, - "step": 514000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0378, - "step": 514100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0365, - "step": 514200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0385, - "step": 514300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0365, - "step": 514400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0363, - "step": 514500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 514600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0363, - "step": 514700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0399, - "step": 514800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 514900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0383, - "step": 515000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0385, - "step": 515100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 515200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0364, - "step": 515300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0393, - "step": 515400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0373, - "step": 515500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 515600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0363, - "step": 515700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 515800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 515900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 516000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0368, - "step": 516100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 516200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0373, - "step": 516300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 516400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0383, - "step": 516500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0387, - "step": 516600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0366, - "step": 516700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 516800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0401, - "step": 516900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0363, - "step": 517000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0378, - "step": 517100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0377, - "step": 517200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0379, - "step": 517300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0368, - "step": 517400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0374, - "step": 517500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0386, - "step": 517600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0366, - "step": 517700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0382, - "step": 517800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0394, - "step": 517900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0369, - "step": 518000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0371, - "step": 518100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0391, - "step": 518200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0371, - "step": 518300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0374, - "step": 518400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.039, - "step": 518500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0419, - "step": 518600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0365, - "step": 518700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0365, - "step": 518800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0369, - "step": 518900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 519000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0376, - "step": 519100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 519200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 519300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0368, - "step": 519400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0372, - "step": 519500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0369, - "step": 519600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0368, - "step": 519700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0367, - "step": 519800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0382, - "step": 519900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0373, - "step": 520000 - }, - { - "epoch": 0.0, - "eval_loss": 0.032989501953125, - "eval_runtime": 3122.3006, - "eval_samples_per_second": 360.223, - "eval_steps_per_second": 22.514, - "step": 520000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 520100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0371, - "step": 520200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0383, - "step": 520300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 520400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0385, - "step": 520500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 520600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0368, - "step": 520700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0371, - "step": 520800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0366, - "step": 520900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0371, - "step": 521000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 521100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 521200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0375, - "step": 521300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0368, - "step": 521400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 521500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0372, - "step": 521600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 521700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 521800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0382, - "step": 521900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0365, - "step": 522000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0367, - "step": 522100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 522200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0371, - "step": 522300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 522400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0376, - "step": 522500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0385, - "step": 522600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 522700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0376, - "step": 522800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0384, - "step": 522900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0373, - "step": 523000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 523100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 523200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.038, - "step": 523300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 523400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0395, - "step": 523500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0371, - "step": 523600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0375, - "step": 523700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0371, - "step": 523800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0367, - "step": 523900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 524000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.038, - "step": 524100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 524200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 524300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0365, - "step": 524400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0366, - "step": 524500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0371, - "step": 524600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0374, - "step": 524700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0376, - "step": 524800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0381, - "step": 524900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0369, - "step": 525000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 525100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0391, - "step": 525200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0363, - "step": 525300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.038, - "step": 525400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 525500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0366, - "step": 525600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 525700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0366, - "step": 525800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 525900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0369, - "step": 526000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0369, - "step": 526100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0364, - "step": 526200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 526300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0382, - "step": 526400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0381, - "step": 526500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 526600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.038, - "step": 526700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0367, - "step": 526800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 526900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0387, - "step": 527000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0373, - "step": 527100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0372, - "step": 527200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0387, - "step": 527300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0365, - "step": 527400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 527500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 527600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 527700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0375, - "step": 527800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.038, - "step": 527900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0366, - "step": 528000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0372, - "step": 528100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0368, - "step": 528200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0373, - "step": 528300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 528400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0401, - "step": 528500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 528600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0424, - "step": 528700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 528800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0392, - "step": 528900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 529000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0382, - "step": 529100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0366, - "step": 529200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0378, - "step": 529300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0373, - "step": 529400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0378, - "step": 529500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0383, - "step": 529600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 529700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 529800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 529900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0385, - "step": 530000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0367, - "step": 530100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0378, - "step": 530200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0367, - "step": 530300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0368, - "step": 530400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 530500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.038, - "step": 530600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 530700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0378, - "step": 530800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0369, - "step": 530900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 531000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0367, - "step": 531100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0365, - "step": 531200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0366, - "step": 531300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0395, - "step": 531400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0369, - "step": 531500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 531600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 531700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0388, - "step": 531800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 531900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 532000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 532100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0366, - "step": 532200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0378, - "step": 532300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 532400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0388, - "step": 532500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0372, - "step": 532600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0377, - "step": 532700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0364, - "step": 532800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 532900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 533000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 533100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 533200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0372, - "step": 533300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 533400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0377, - "step": 533500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 533600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 533700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 533800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0372, - "step": 533900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0363, - "step": 534000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0383, - "step": 534100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0369, - "step": 534200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0381, - "step": 534300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0366, - "step": 534400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0373, - "step": 534500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0364, - "step": 534600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0379, - "step": 534700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 534800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0365, - "step": 534900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 535000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0386, - "step": 535100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 535200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 535300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 535400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0368, - "step": 535500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 535600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 535700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0383, - "step": 535800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0363, - "step": 535900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 536000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0373, - "step": 536100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0371, - "step": 536200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 536300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0363, - "step": 536400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 536500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 536600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0375, - "step": 536700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 536800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0372, - "step": 536900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 537000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0369, - "step": 537100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0364, - "step": 537200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0364, - "step": 537300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 537400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0376, - "step": 537500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 537600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.038, - "step": 537700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0369, - "step": 537800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0367, - "step": 537900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0376, - "step": 538000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 538100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0385, - "step": 538200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 538300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0365, - "step": 538400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0364, - "step": 538500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0365, - "step": 538600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0393, - "step": 538700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 538800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0373, - "step": 538900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0374, - "step": 539000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 539100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 539200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0367, - "step": 539300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 539400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0363, - "step": 539500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 539600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0376, - "step": 539700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0376, - "step": 539800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0375, - "step": 539900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0379, - "step": 540000 - }, - { - "epoch": 0.0, - "eval_loss": 0.032684326171875, - "eval_runtime": 3257.3593, - "eval_samples_per_second": 345.287, - "eval_steps_per_second": 21.581, - "step": 540000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0378, - "step": 540100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0374, - "step": 540200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0366, - "step": 540300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0386, - "step": 540400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0377, - "step": 540500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 540600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0367, - "step": 540700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0382, - "step": 540800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0369, - "step": 540900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.038, - "step": 541000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.038, - "step": 541100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 541200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0373, - "step": 541300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0388, - "step": 541400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 541500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 541600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0386, - "step": 541700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0373, - "step": 541800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0399, - "step": 541900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 542000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 542100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0378, - "step": 542200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 542300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 542400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 542500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0384, - "step": 542600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 542700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 542800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0368, - "step": 542900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0365, - "step": 543000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 543100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 543200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0389, - "step": 543300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 543400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 543500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 543600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 543700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0365, - "step": 543800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 543900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 544000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 544100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0372, - "step": 544200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.038, - "step": 544300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 544400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0376, - "step": 544500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 544600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0386, - "step": 544700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0371, - "step": 544800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 544900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 545000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 545100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0366, - "step": 545200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0373, - "step": 545300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 545400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0396, - "step": 545500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0374, - "step": 545600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0365, - "step": 545700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0368, - "step": 545800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 545900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0365, - "step": 546000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0386, - "step": 546100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0383, - "step": 546200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0365, - "step": 546300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0363, - "step": 546400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0364, - "step": 546500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 546600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0396, - "step": 546700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 546800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0372, - "step": 546900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0374, - "step": 547000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 547100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 547200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 547300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 547400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0367, - "step": 547500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0379, - "step": 547600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0374, - "step": 547700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0368, - "step": 547800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 547900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 548000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0374, - "step": 548100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 548200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 548300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 548400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0368, - "step": 548500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0376, - "step": 548600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0369, - "step": 548700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0369, - "step": 548800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 548900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0384, - "step": 549000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 549100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0366, - "step": 549200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0372, - "step": 549300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0373, - "step": 549400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 549500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.038, - "step": 549600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0379, - "step": 549700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 549800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 549900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0382, - "step": 550000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0386, - "step": 550100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 550200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 550300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 550400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 550500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0363, - "step": 550600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 550700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0391, - "step": 550800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 550900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 551000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 551100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0367, - "step": 551200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0388, - "step": 551300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 551400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 551500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0366, - "step": 551600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0381, - "step": 551700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0381, - "step": 551800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 551900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0369, - "step": 552000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 552100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 552200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 552300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.038, - "step": 552400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0377, - "step": 552500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 552600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0373, - "step": 552700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0384, - "step": 552800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0379, - "step": 552900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0393, - "step": 553000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 553100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 553200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 553300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0363, - "step": 553400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 553500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 553600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 553700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0386, - "step": 553800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 553900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 554000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 554100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0388, - "step": 554200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 554300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0365, - "step": 554400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0363, - "step": 554500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 554600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0382, - "step": 554700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0373, - "step": 554800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0369, - "step": 554900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0374, - "step": 555000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 555100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0375, - "step": 555200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 555300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 555400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 555500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 555600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0367, - "step": 555700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0395, - "step": 555800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 555900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0399, - "step": 556000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 556100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0383, - "step": 556200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0365, - "step": 556300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.038, - "step": 556400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 556500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0367, - "step": 556600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 556700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 556800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0363, - "step": 556900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0371, - "step": 557000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 557100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0364, - "step": 557200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0372, - "step": 557300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 557400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 557500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 557600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 557700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0378, - "step": 557800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0383, - "step": 557900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0388, - "step": 558000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 558100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0369, - "step": 558200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.038, - "step": 558300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.038, - "step": 558400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0364, - "step": 558500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 558600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 558700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 558800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0382, - "step": 558900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 559000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0365, - "step": 559100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0365, - "step": 559200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0365, - "step": 559300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0395, - "step": 559400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0386, - "step": 559500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 559600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0373, - "step": 559700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0364, - "step": 559800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0366, - "step": 559900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 560000 - }, - { - "epoch": 0.0, - "eval_loss": 0.032989501953125, - "eval_runtime": 3191.314, - "eval_samples_per_second": 352.433, - "eval_steps_per_second": 22.027, - "step": 560000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0381, - "step": 560100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.038, - "step": 560200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0366, - "step": 560300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 560400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 560500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0383, - "step": 560600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0366, - "step": 560700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 560800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 560900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0369, - "step": 561000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0367, - "step": 561100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0363, - "step": 561200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0404, - "step": 561300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 561400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 561500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0375, - "step": 561600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 561700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 561800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 561900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0376, - "step": 562000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 562100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 562200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 562300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 562400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 562500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 562600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 562700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0384, - "step": 562800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0387, - "step": 562900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 563000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0374, - "step": 563100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 563200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 563300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 563400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0368, - "step": 563500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0367, - "step": 563600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0367, - "step": 563700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 563800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 563900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0377, - "step": 564000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0367, - "step": 564100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 564200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0374, - "step": 564300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0368, - "step": 564400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0372, - "step": 564500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 564600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0375, - "step": 564700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0364, - "step": 564800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 564900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0374, - "step": 565000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0369, - "step": 565100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0365, - "step": 565200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0395, - "step": 565300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 565400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 565500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0383, - "step": 565600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 565700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0375, - "step": 565800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.038, - "step": 565900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 566000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 566100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0374, - "step": 566200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 566300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0375, - "step": 566400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.038, - "step": 566500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 566600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0372, - "step": 566700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0363, - "step": 566800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 566900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0365, - "step": 567000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 567100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0368, - "step": 567200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0371, - "step": 567300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0369, - "step": 567400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0382, - "step": 567500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 567600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0363, - "step": 567700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 567800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0372, - "step": 567900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0366, - "step": 568000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0369, - "step": 568100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0367, - "step": 568200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0378, - "step": 568300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 568400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0381, - "step": 568500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 568600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0377, - "step": 568700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0379, - "step": 568800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0373, - "step": 568900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0377, - "step": 569000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 569100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0373, - "step": 569200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 569300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0363, - "step": 569400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 569500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 569600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0378, - "step": 569700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0364, - "step": 569800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0368, - "step": 569900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 570000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0371, - "step": 570100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0369, - "step": 570200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0384, - "step": 570300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 570400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0373, - "step": 570500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 570600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0378, - "step": 570700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0381, - "step": 570800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 570900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 571000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 571100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 571200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 571300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 571400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 571500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 571600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 571700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0366, - "step": 571800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 571900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 572000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0384, - "step": 572100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 572200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0367, - "step": 572300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0379, - "step": 572400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 572500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0366, - "step": 572600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0393, - "step": 572700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 572800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0375, - "step": 572900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 573000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0375, - "step": 573100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0371, - "step": 573200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 573300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0375, - "step": 573400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0364, - "step": 573500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 573600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0364, - "step": 573700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0367, - "step": 573800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 573900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.038, - "step": 574000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0372, - "step": 574100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0389, - "step": 574200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 574300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 574400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0364, - "step": 574500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 574600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0365, - "step": 574700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0381, - "step": 574800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0365, - "step": 574900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0374, - "step": 575000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0378, - "step": 575100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 575200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 575300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0365, - "step": 575400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0363, - "step": 575500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0371, - "step": 575600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0387, - "step": 575700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0371, - "step": 575800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0375, - "step": 575900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0368, - "step": 576000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 576100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 576200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 576300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 576400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0369, - "step": 576500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 576600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0363, - "step": 576700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 576800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0379, - "step": 576900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0375, - "step": 577000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 577100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0384, - "step": 577200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0367, - "step": 577300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 577400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 577500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 577600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 577700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 577800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 577900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.039, - "step": 578000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0367, - "step": 578100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0365, - "step": 578200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0363, - "step": 578300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0367, - "step": 578400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 578500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 578600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 578700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 578800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0372, - "step": 578900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 579000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 579100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 579200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0376, - "step": 579300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0375, - "step": 579400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 579500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 579600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0369, - "step": 579700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 579800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 579900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 580000 - }, - { - "epoch": 0.0, - "eval_loss": 0.032958984375, - "eval_runtime": 3246.6947, - "eval_samples_per_second": 346.421, - "eval_steps_per_second": 21.652, - "step": 580000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0374, - "step": 580100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0379, - "step": 580200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0365, - "step": 580300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0384, - "step": 580400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 580500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 580600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 580700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0369, - "step": 580800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0365, - "step": 580900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0366, - "step": 581000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 581100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0371, - "step": 581200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 581300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 581400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.039, - "step": 581500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0368, - "step": 581600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.038, - "step": 581700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 581800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0371, - "step": 581900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0371, - "step": 582000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0375, - "step": 582100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 582200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0368, - "step": 582300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 582400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 582500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 582600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 582700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0363, - "step": 582800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 582900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 583000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 583100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 583200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 583300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0363, - "step": 583400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 583500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0369, - "step": 583600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0363, - "step": 583700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0368, - "step": 583800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0375, - "step": 583900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0368, - "step": 584000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0384, - "step": 584100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.038, - "step": 584200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0377, - "step": 584300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 584400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 584500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 584600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 584700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0367, - "step": 584800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 584900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0378, - "step": 585000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 585100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 585200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 585300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 585400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 585500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0378, - "step": 585600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0363, - "step": 585700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 585800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 585900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 586000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 586100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 586200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 586300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0372, - "step": 586400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 586500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0365, - "step": 586600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 586700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0367, - "step": 586800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0366, - "step": 586900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 587000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 587100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 587200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 587300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0364, - "step": 587400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 587500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0395, - "step": 587600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0364, - "step": 587700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 587800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0371, - "step": 587900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0368, - "step": 588000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 588100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 588200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0375, - "step": 588300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0372, - "step": 588400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0371, - "step": 588500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0382, - "step": 588600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0383, - "step": 588700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0373, - "step": 588800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 588900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0364, - "step": 589000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0369, - "step": 589100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 589200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0382, - "step": 589300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 589400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 589500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0368, - "step": 589600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 589700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0376, - "step": 589800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 589900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0366, - "step": 590000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 590100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 590200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0371, - "step": 590300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 590400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 590500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 590600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 590700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 590800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 590900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 591000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 591100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0363, - "step": 591200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 591300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 591400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 591500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 591600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 591700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0365, - "step": 591800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 591900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0366, - "step": 592000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0376, - "step": 592100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 592200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 592300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 592400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 592500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 592600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0382, - "step": 592700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0368, - "step": 592800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0368, - "step": 592900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0365, - "step": 593000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 593100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 593200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0384, - "step": 593300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0383, - "step": 593400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 593500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 593600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0378, - "step": 593700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0371, - "step": 593800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 593900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 594000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0368, - "step": 594100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0372, - "step": 594200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0371, - "step": 594300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0368, - "step": 594400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 594500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 594600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 594700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 594800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0365, - "step": 594900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 595000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 595100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0373, - "step": 595200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0363, - "step": 595300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0375, - "step": 595400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0363, - "step": 595500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0369, - "step": 595600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 595700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0371, - "step": 595800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0393, - "step": 595900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 596000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 596100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 596200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 596300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 596400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0364, - "step": 596500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0368, - "step": 596600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 596700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 596800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0364, - "step": 596900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 597000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0366, - "step": 597100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0366, - "step": 597200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 597300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 597400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 597500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0385, - "step": 597600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0363, - "step": 597700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 597800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 597900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 598000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0369, - "step": 598100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 598200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 598300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 598400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0369, - "step": 598500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0379, - "step": 598600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 598700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 598800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0366, - "step": 598900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 599000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 599100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0364, - "step": 599200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 599300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 599400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 599500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 599600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 599700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0367, - "step": 599800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 599900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 600000 - }, - { - "epoch": 0.0, - "eval_loss": 0.032318115234375, - "eval_runtime": 3276.9359, - "eval_samples_per_second": 343.224, - "eval_steps_per_second": 21.452, - "step": 600000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 600100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 600200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0363, - "step": 600300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0379, - "step": 600400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0367, - "step": 600500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0381, - "step": 600600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0366, - "step": 600700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 600800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 600900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0363, - "step": 601000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0368, - "step": 601100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 601200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0375, - "step": 601300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 601400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 601500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0372, - "step": 601600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0363, - "step": 601700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 601800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 601900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 602000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 602100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 602200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 602300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0385, - "step": 602400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0385, - "step": 602500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0366, - "step": 602600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 602700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0388, - "step": 602800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 602900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0365, - "step": 603000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 603100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0383, - "step": 603200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0364, - "step": 603300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 603400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 603500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0381, - "step": 603600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 603700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0374, - "step": 603800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0376, - "step": 603900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0366, - "step": 604000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0367, - "step": 604100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 604200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0367, - "step": 604300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0372, - "step": 604400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 604500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 604600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 604700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 604800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 604900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 605000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0377, - "step": 605100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0373, - "step": 605200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 605300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0369, - "step": 605400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 605500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 605600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 605700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0371, - "step": 605800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0366, - "step": 605900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0373, - "step": 606000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 606100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 606200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0376, - "step": 606300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0367, - "step": 606400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0391, - "step": 606500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0368, - "step": 606600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 606700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 606800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0373, - "step": 606900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0378, - "step": 607000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 607100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 607200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 607300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0385, - "step": 607400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 607500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 607600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 607700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 607800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 607900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 608000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0368, - "step": 608100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 608200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 608300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0374, - "step": 608400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 608500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 608600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0363, - "step": 608700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0366, - "step": 608800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 608900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 609000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 609100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 609200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 609300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0364, - "step": 609400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 609500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0386, - "step": 609600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0376, - "step": 609700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 609800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0363, - "step": 609900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 610000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0365, - "step": 610100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 610200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0367, - "step": 610300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0369, - "step": 610400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 610500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0375, - "step": 610600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 610700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0377, - "step": 610800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0372, - "step": 610900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0381, - "step": 611000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 611100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 611200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 611300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 611400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 611500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 611600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 611700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 611800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0369, - "step": 611900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 612000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 612100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 612200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0366, - "step": 612300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 612400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 612500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 612600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0368, - "step": 612700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 612800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0375, - "step": 612900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 613000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 613100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0376, - "step": 613200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 613300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0367, - "step": 613400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0365, - "step": 613500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 613600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 613700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 613800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 613900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 614000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 614100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 614200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 614300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0372, - "step": 614400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0386, - "step": 614500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 614600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 614700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 614800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 614900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.038, - "step": 615000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0377, - "step": 615100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 615200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 615300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0364, - "step": 615400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 615500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 615600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 615700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 615800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 615900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 616000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 616100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 616200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 616300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0369, - "step": 616400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 616500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0389, - "step": 616600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0363, - "step": 616700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 616800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 616900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 617000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 617100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 617200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 617300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 617400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 617500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0368, - "step": 617600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0363, - "step": 617700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 617800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0363, - "step": 617900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 618000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 618100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 618200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0381, - "step": 618300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0396, - "step": 618400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0365, - "step": 618500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 618600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 618700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0371, - "step": 618800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 618900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 619000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 619100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 619200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 619300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0366, - "step": 619400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 619500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 619600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0376, - "step": 619700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 619800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0375, - "step": 619900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0364, - "step": 620000 - }, - { - "epoch": 0.0, - "eval_loss": 0.032318115234375, - "eval_runtime": 3169.2624, - "eval_samples_per_second": 354.885, - "eval_steps_per_second": 22.181, - "step": 620000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0377, - "step": 620100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 620200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0371, - "step": 620300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 620400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 620500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0371, - "step": 620600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0373, - "step": 620700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 620800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 620900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 621000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0376, - "step": 621100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0368, - "step": 621200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0366, - "step": 621300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0392, - "step": 621400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0369, - "step": 621500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0366, - "step": 621600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 621700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 621800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0368, - "step": 621900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 622000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 622100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 622200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 622300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0368, - "step": 622400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0377, - "step": 622500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0365, - "step": 622600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0364, - "step": 622700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0364, - "step": 622800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0374, - "step": 622900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0364, - "step": 623000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 623100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0371, - "step": 623200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 623300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0368, - "step": 623400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 623500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 623600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0367, - "step": 623700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 623800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 623900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 624000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 624100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 624200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 624300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0366, - "step": 624400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0363, - "step": 624500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 624600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 624700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 624800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 624900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0374, - "step": 625000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 625100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0375, - "step": 625200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0366, - "step": 625300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0378, - "step": 625400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 625500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 625600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0368, - "step": 625700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 625800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0373, - "step": 625900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0375, - "step": 626000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 626100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 626200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 626300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 626400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 626500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 626600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0367, - "step": 626700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 626800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 626900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 627000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 627100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 627200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 627300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 627400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 627500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 627600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 627700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 627800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 627900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0368, - "step": 628000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 628100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 628200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0374, - "step": 628300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 628400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 628500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 628600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0383, - "step": 628700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0375, - "step": 628800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0368, - "step": 628900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 629000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 629100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 629200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0369, - "step": 629300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0363, - "step": 629400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 629500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 629600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0371, - "step": 629700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 629800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 629900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0365, - "step": 630000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0373, - "step": 630100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 630200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 630300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 630400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0368, - "step": 630500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 630600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 630700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 630800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 630900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0364, - "step": 631000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0363, - "step": 631100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 631200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 631300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 631400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0363, - "step": 631500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 631600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 631700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 631800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0363, - "step": 631900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 632000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0387, - "step": 632100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 632200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 632300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 632400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 632500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0368, - "step": 632600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 632700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 632800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 632900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0365, - "step": 633000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 633100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 633200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 633300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 633400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0365, - "step": 633500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 633600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 633700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 633800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 633900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 634000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 634100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 634200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0366, - "step": 634300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 634400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 634500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 634600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0366, - "step": 634700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 634800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 634900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0369, - "step": 635000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 635100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 635200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 635300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0367, - "step": 635400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 635500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 635600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 635700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0387, - "step": 635800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0373, - "step": 635900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 636000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0366, - "step": 636100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 636200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 636300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 636400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0368, - "step": 636500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0371, - "step": 636600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 636700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 636800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 636900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 637000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 637100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 637200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0367, - "step": 637300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 637400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 637500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 637600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 637700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0367, - "step": 637800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 637900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 638000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 638100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0373, - "step": 638200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 638300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 638400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0409, - "step": 638500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 638600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 638700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 638800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0363, - "step": 638900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 639000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0364, - "step": 639100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 639200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0373, - "step": 639300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 639400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 639500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 639600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0378, - "step": 639700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 639800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 639900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 640000 - }, - { - "epoch": 0.0, - "eval_loss": 0.03216552734375, - "eval_runtime": 3040.9401, - "eval_samples_per_second": 369.86, - "eval_steps_per_second": 23.117, - "step": 640000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 640100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0366, - "step": 640200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0367, - "step": 640300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 640400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 640500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 640600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0383, - "step": 640700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 640800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 640900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 641000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 641100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0364, - "step": 641200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 641300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 641400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 641500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 641600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 641700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0363, - "step": 641800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 641900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 642000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0372, - "step": 642100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 642200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 642300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0375, - "step": 642400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0373, - "step": 642500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 642600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 642700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 642800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 642900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 643000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 643100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 643200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 643300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 643400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 643500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 643600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 643700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0371, - "step": 643800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0365, - "step": 643900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0385, - "step": 644000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0364, - "step": 644100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0365, - "step": 644200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0384, - "step": 644300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0369, - "step": 644400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 644500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 644600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 644700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.038, - "step": 644800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0363, - "step": 644900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0383, - "step": 645000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 645100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0371, - "step": 645200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 645300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0367, - "step": 645400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 645500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 645600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 645700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 645800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0366, - "step": 645900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0367, - "step": 646000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0368, - "step": 646100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 646200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 646300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 646400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 646500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 646600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 646700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 646800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 646900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 647000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 647100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0375, - "step": 647200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0365, - "step": 647300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0375, - "step": 647400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0368, - "step": 647500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 647600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 647700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0365, - "step": 647800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 647900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0368, - "step": 648000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0364, - "step": 648100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 648200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 648300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 648400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 648500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0369, - "step": 648600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 648700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 648800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 648900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0367, - "step": 649000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0363, - "step": 649100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 649200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 649300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0364, - "step": 649400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 649500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0367, - "step": 649600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 649700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 649800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 649900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 650000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 650100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 650200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 650300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0364, - "step": 650400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0368, - "step": 650500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 650600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 650700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0366, - "step": 650800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0374, - "step": 650900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 651000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 651100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0374, - "step": 651200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 651300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 651400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 651500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.038, - "step": 651600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 651700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 651800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 651900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0383, - "step": 652000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 652100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 652200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 652300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 652400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0365, - "step": 652500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 652600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 652700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 652800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 652900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0378, - "step": 653000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0366, - "step": 653100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0386, - "step": 653200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 653300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0367, - "step": 653400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 653500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 653600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0365, - "step": 653700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 653800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 653900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0365, - "step": 654000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0364, - "step": 654100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 654200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 654300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 654400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 654500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 654600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 654700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0367, - "step": 654800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 654900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 655000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 655100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 655200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 655300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0364, - "step": 655400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 655500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 655600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 655700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 655800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0389, - "step": 655900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 656000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 656100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 656200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0377, - "step": 656300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 656400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 656500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 656600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0364, - "step": 656700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 656800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 656900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 657000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 657100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 657200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 657300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 657400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 657500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 657600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 657700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 657800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0373, - "step": 657900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 658000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0372, - "step": 658100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0366, - "step": 658200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0363, - "step": 658300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 658400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0366, - "step": 658500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 658600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 658700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0369, - "step": 658800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 658900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 659000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0363, - "step": 659100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 659200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 659300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 659400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0375, - "step": 659500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0363, - "step": 659600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 659700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 659800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 659900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0376, - "step": 660000 - }, - { - "epoch": 0.0, - "eval_loss": 0.03192138671875, - "eval_runtime": 3088.6861, - "eval_samples_per_second": 364.143, - "eval_steps_per_second": 22.759, - "step": 660000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 660100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 660200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 660300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 660400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 660500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 660600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 660700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 660800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 660900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 661000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0376, - "step": 661100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 661200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 661300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 661400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0365, - "step": 661500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 661600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0365, - "step": 661700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 661800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 661900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 662000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0382, - "step": 662100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 662200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0368, - "step": 662300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 662400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 662500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 662600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 662700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 662800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0371, - "step": 662900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0367, - "step": 663000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 663100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 663200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 663300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0369, - "step": 663400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 663500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 663600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 663700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0375, - "step": 663800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 663900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 664000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 664100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 664200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 664300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0367, - "step": 664400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 664500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 664600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 664700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 664800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0366, - "step": 664900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 665000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 665100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 665200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 665300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.038, - "step": 665400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 665500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 665600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 665700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 665800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 665900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 666000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0377, - "step": 666100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 666200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 666300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 666400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 666500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0383, - "step": 666600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0372, - "step": 666700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 666800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0365, - "step": 666900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 667000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 667100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 667200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0367, - "step": 667300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 667400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 667500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 667600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 667700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 667800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 667900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 668000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 668100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0365, - "step": 668200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 668300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 668400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0365, - "step": 668500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 668600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 668700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0363, - "step": 668800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0364, - "step": 668900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 669000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 669100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 669200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 669300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 669400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 669500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 669600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 669700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 669800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 669900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 670000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 670100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 670200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 670300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 670400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 670500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 670600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 670700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0363, - "step": 670800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 670900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 671000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 671100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0368, - "step": 671200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 671300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0368, - "step": 671400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0363, - "step": 671500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 671600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 671700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0365, - "step": 671800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 671900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 672000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 672100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 672200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 672300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 672400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0366, - "step": 672500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 672600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 672700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 672800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 672900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 673000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 673100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 673200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 673300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 673400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 673500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 673600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 673700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 673800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 673900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 674000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 674100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 674200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 674300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 674400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 674500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 674600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 674700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0365, - "step": 674800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 674900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 675000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0367, - "step": 675100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 675200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 675300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 675400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 675500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 675600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0373, - "step": 675700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 675800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 675900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 676000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 676100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 676200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0385, - "step": 676300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 676400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 676500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0379, - "step": 676600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 676700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 676800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 676900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 677000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 677100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 677200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 677300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0373, - "step": 677400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 677500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 677600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 677700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0372, - "step": 677800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 677900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0377, - "step": 678000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0393, - "step": 678100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 678200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0363, - "step": 678300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 678400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 678500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 678600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 678700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0363, - "step": 678800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 678900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 679000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 679100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 679200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 679300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 679400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0364, - "step": 679500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 679600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 679700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 679800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 679900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 680000 - }, - { - "epoch": 0.0, - "eval_loss": 0.03179931640625, - "eval_runtime": 3153.5498, - "eval_samples_per_second": 356.653, - "eval_steps_per_second": 22.291, - "step": 680000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 680100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 680200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 680300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 680400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 680500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0378, - "step": 680600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 680700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0368, - "step": 680800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 680900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 681000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 681100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 681200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 681300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 681400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 681500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0367, - "step": 681600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 681700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 681800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 681900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 682000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 682100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 682200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0364, - "step": 682300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0373, - "step": 682400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 682500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 682600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 682700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0374, - "step": 682800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 682900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0364, - "step": 683000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0367, - "step": 683100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 683200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0379, - "step": 683300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 683400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 683500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 683600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 683700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 683800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0369, - "step": 683900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 684000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 684100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 684200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 684300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 684400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 684500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 684600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 684700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0368, - "step": 684800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 684900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0366, - "step": 685000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 685100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0368, - "step": 685200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 685300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 685400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 685500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 685600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 685700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 685800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0365, - "step": 685900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0363, - "step": 686000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 686100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 686200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 686300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 686400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 686500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0368, - "step": 686600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 686700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 686800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 686900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 687000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 687100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 687200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 687300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 687400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 687500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 687600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 687700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 687800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 687900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 688000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 688100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 688200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 688300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 688400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 688500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 688600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 688700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 688800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 688900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 689000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 689100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0365, - "step": 689200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 689300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 689400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 689500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 689600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 689700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 689800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 689900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0374, - "step": 690000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 690100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0366, - "step": 690200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 690300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 690400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 690500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 690600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 690700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 690800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0371, - "step": 690900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 691000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0372, - "step": 691100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 691200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 691300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 691400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 691500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 691600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 691700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 691800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0365, - "step": 691900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 692000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0368, - "step": 692100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 692200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 692300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 692400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 692500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 692600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 692700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 692800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 692900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 693000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 693100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 693200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 693300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 693400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0369, - "step": 693500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 693600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 693700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 693800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0387, - "step": 693900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 694000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 694100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 694200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 694300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 694400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 694500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 694600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 694700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 694800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 694900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 695000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 695100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0373, - "step": 695200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 695300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0372, - "step": 695400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 695500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 695600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 695700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 695800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 695900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 696000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 696100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 696200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 696300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 696400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 696500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 696600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 696700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0372, - "step": 696800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 696900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 697000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 697100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0364, - "step": 697200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 697300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 697400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 697500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 697600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 697700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 697800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 697900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 698000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 698100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 698200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0371, - "step": 698300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 698400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 698500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0369, - "step": 698600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 698700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0365, - "step": 698800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 698900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 699000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 699100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 699200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 699300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 699400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0363, - "step": 699500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 699600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 699700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 699800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 699900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 700000 - }, - { - "epoch": 0.0, - "eval_loss": 0.032257080078125, - "eval_runtime": 3135.6458, - "eval_samples_per_second": 358.689, - "eval_steps_per_second": 22.418, - "step": 700000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 700100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 700200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 700300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 700400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 700500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 700600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 700700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 700800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 700900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 701000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0373, - "step": 701100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 701200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 701300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 701400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 701500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0367, - "step": 701600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 701700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 701800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0363, - "step": 701900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 702000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 702100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 702200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0371, - "step": 702300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0367, - "step": 702400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 702500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 702600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 702700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 702800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 702900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 703000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 703100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0365, - "step": 703200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 703300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 703400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 703500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 703600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 703700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 703800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 703900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 704000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 704100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0364, - "step": 704200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 704300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 704400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 704500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 704600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 704700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0363, - "step": 704800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 704900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 705000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0365, - "step": 705100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 705200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0368, - "step": 705300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 705400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 705500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0365, - "step": 705600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 705700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 705800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 705900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 706000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 706100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0364, - "step": 706200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 706300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0364, - "step": 706400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 706500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 706600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 706700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0364, - "step": 706800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 706900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 707000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 707100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 707200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 707300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 707400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 707500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 707600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0368, - "step": 707700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0367, - "step": 707800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 707900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0369, - "step": 708000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 708100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 708200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 708300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 708400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0364, - "step": 708500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 708600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 708700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0367, - "step": 708800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 708900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 709000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 709100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0363, - "step": 709200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 709300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 709400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 709500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0372, - "step": 709600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 709700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 709800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0364, - "step": 709900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 710000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 710100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 710200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 710300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 710400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 710500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 710600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 710700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0368, - "step": 710800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 710900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 711000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 711100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 711200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 711300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 711400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 711500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 711600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 711700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 711800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0365, - "step": 711900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 712000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 712100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 712200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0368, - "step": 712300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 712400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 712500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 712600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 712700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 712800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 712900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 713000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0363, - "step": 713100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 713200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 713300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 713400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 713500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 713600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 713700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 713800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 713900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0375, - "step": 714000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 714100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 714200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 714300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 714400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 714500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 714600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 714700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 714800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 714900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 715000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 715100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 715200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 715300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 715400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0364, - "step": 715500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 715600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 715700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 715800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 715900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 716000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 716100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 716200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 716300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 716400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 716500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 716600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0371, - "step": 716700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 716800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 716900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 717000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0365, - "step": 717100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 717200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0369, - "step": 717300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 717400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 717500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 717600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.038, - "step": 717700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 717800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 717900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 718000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 718100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 718200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 718300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 718400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0372, - "step": 718500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 718600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 718700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 718800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 718900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 719000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 719100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 719200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 719300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 719400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 719500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 719600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 719700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 719800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 719900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 720000 - }, - { - "epoch": 0.0, - "eval_loss": 0.031890869140625, - "eval_runtime": 3100.2132, - "eval_samples_per_second": 362.789, - "eval_steps_per_second": 22.675, - "step": 720000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 720100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 720200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 720300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0377, - "step": 720400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 720500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 720600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 720700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 720800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 720900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0377, - "step": 721000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 721100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 721200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0367, - "step": 721300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 721400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 721500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0364, - "step": 721600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 721700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 721800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 721900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 722000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 722100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 722200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 722300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 722400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 722500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 722600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 722700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 722800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 722900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 723000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 723100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0366, - "step": 723200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 723300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 723400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0363, - "step": 723500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 723600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 723700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 723800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 723900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 724000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 724100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 724200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0366, - "step": 724300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 724400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 724500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 724600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 724700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0364, - "step": 724800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 724900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 725000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0374, - "step": 725100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 725200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 725300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0373, - "step": 725400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 725500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 725600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 725700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 725800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 725900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 726000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 726100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 726200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 726300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 726400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 726500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 726600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 726700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 726800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 726900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 727000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 727100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0368, - "step": 727200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 727300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 727400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 727500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0365, - "step": 727600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 727700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 727800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 727900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 728000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 728100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 728200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 728300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 728400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 728500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 728600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 728700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 728800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 728900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 729000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 729100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 729200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 729300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 729400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 729500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0366, - "step": 729600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 729700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 729800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 729900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 730000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 730100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 730200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 730300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 730400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 730500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 730600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 730700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 730800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 730900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 731000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 731100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 731200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 731300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 731400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 731500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 731600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 731700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0364, - "step": 731800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0378, - "step": 731900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 732000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 732100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0371, - "step": 732200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 732300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 732400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 732500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 732600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 732700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 732800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 732900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0388, - "step": 733000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 733100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 733200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 733300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 733400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 733500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 733600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 733700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0365, - "step": 733800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 733900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 734000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 734100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 734200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 734300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 734400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0365, - "step": 734500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 734600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 734700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0368, - "step": 734800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 734900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 735000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 735100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 735200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 735300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0371, - "step": 735400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 735500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 735600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 735700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 735800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 735900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 736000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 736100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 736200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 736300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0365, - "step": 736400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 736500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 736600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 736700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 736800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 736900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 737000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0363, - "step": 737100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0367, - "step": 737200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 737300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.039, - "step": 737400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 737500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 737600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0365, - "step": 737700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 737800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 737900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 738000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 738100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 738200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 738300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0372, - "step": 738400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 738500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 738600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 738700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 738800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 738900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 739000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 739100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 739200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 739300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0364, - "step": 739400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.038, - "step": 739500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 739600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 739700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 739800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 739900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0366, - "step": 740000 - }, - { - "epoch": 0.0, - "eval_loss": 0.031463623046875, - "eval_runtime": 3193.1308, - "eval_samples_per_second": 352.232, - "eval_steps_per_second": 22.015, - "step": 740000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 740100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 740200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 740300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 740400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 740500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 740600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 740700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 740800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 740900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 741000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 741100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0366, - "step": 741200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0364, - "step": 741300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 741400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 741500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0405, - "step": 741600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0363, - "step": 741700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 741800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 741900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0368, - "step": 742000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 742100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 742200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 742300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 742400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 742500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 742600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 742700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 742800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 742900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0363, - "step": 743000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 743100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 743200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 743300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 743400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 743500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 743600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 743700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 743800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 743900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 744000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 744100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 744200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 744300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 744400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 744500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 744600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 744700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 744800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 744900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 745000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 745100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 745200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 745300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 745400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 745500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 745600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 745700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0366, - "step": 745800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 745900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 746000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0376, - "step": 746100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0368, - "step": 746200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 746300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 746400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 746500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 746600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 746700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 746800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 746900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 747000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 747100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 747200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 747300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0363, - "step": 747400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 747500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0374, - "step": 747600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 747700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0373, - "step": 747800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 747900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 748000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 748100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 748200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 748300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 748400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 748500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 748600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 748700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 748800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0367, - "step": 748900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0373, - "step": 749000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 749100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 749200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 749300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 749400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 749500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 749600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 749700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0363, - "step": 749800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 749900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 750000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 750100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 750200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 750300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 750400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 750500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 750600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 750700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 750800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 750900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 751000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 751100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 751200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 751300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 751400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 751500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0366, - "step": 751600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0366, - "step": 751700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 751800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 751900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 752000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 752100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 752200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 752300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 752400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 752500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 752600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 752700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 752800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 752900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 753000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 753100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 753200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 753300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 753400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 753500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 753600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 753700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 753800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0368, - "step": 753900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 754000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 754100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 754200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 754300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 754400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 754500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 754600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 754700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 754800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 754900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 755000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 755100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 755200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 755300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 755400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 755500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 755600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 755700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0367, - "step": 755800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 755900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 756000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 756100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 756200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 756300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 756400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0375, - "step": 756500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 756600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 756700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 756800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 756900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 757000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 757100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 757200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 757300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 757400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 757500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 757600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 757700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 757800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 757900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 758000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 758100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 758200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 758300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 758400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 758500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 758600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 758700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 758800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 758900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 759000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 759100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 759200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 759300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 759400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 759500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 759600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 759700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 759800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 759900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 760000 - }, - { - "epoch": 0.0, - "eval_loss": 0.03131103515625, - "eval_runtime": 3049.8434, - "eval_samples_per_second": 368.781, - "eval_steps_per_second": 23.049, - "step": 760000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0373, - "step": 760100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 760200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 760300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 760400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 760500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0381, - "step": 760600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 760700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 760800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0377, - "step": 760900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0365, - "step": 761000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 761100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 761200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 761300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 761400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 761500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 761600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 761700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 761800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 761900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 762000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 762100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 762200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 762300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 762400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 762500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 762600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 762700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 762800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 762900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0365, - "step": 763000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 763100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 763200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 763300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 763400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 763500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0366, - "step": 763600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 763700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 763800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 763900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 764000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 764100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 764200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 764300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 764400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 764500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 764600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 764700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0363, - "step": 764800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 764900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0364, - "step": 765000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 765100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 765200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 765300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 765400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 765500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0368, - "step": 765600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 765700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 765800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 765900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 766000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0371, - "step": 766100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 766200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 766300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 766400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 766500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 766600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 766700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 766800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 766900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 767000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 767100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 767200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 767300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 767400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 767500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 767600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 767700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 767800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 767900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 768000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 768100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 768200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 768300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 768400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 768500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 768600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 768700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 768800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 768900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 769000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 769100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 769200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 769300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 769400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 769500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 769600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 769700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 769800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 769900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 770000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 770100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 770200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0376, - "step": 770300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 770400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 770500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 770600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 770700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 770800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 770900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 771000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 771100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 771200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 771300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 771400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 771500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 771600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 771700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 771800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0382, - "step": 771900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 772000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 772100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 772200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 772300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 772400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 772500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 772600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 772700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 772800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 772900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 773000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 773100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 773200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 773300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 773400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 773500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 773600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 773700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 773800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 773900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 774000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0368, - "step": 774100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 774200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 774300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 774400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 774500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 774600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 774700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 774800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 774900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 775000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 775100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 775200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 775300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 775400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 775500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 775600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0369, - "step": 775700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 775800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 775900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0366, - "step": 776000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 776100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 776200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 776300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 776400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 776500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 776600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 776700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 776800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 776900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 777000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 777100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 777200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 777300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 777400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 777500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 777600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 777700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 777800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 777900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0363, - "step": 778000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0373, - "step": 778100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 778200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 778300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0378, - "step": 778400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0363, - "step": 778500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 778600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 778700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 778800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 778900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 779000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 779100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0365, - "step": 779200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 779300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 779400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 779500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 779600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 779700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 779800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 779900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 780000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0312347412109375, - "eval_runtime": 3485.0614, - "eval_samples_per_second": 322.727, - "eval_steps_per_second": 20.171, - "step": 780000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 780100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 780200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 780300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 780400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0365, - "step": 780500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 780600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 780700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 780800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 780900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 781000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 781100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 781200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 781300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0363, - "step": 781400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0373, - "step": 781500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 781600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 781700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 781800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 781900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 782000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 782100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 782200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 782300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 782400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 782500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 782600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 782700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 782800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 782900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 783000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 783100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 783200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 783300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 783400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 783500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 783600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 783700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 783800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 783900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 784000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 784100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0368, - "step": 784200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 784300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 784400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 784500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 784600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 784700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 784800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 784900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 785000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 785100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 785200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 785300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 785400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 785500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 785600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 785700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 785800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 785900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 786000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 786100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 786200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 786300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 786400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 786500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 786600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 786700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 786800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 786900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 787000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 787100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0367, - "step": 787200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 787300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 787400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 787500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 787600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 787700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 787800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 787900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 788000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 788100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 788200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0377, - "step": 788300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 788400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 788500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 788600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 788700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 788800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 788900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 789000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 789100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 789200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0374, - "step": 789300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0364, - "step": 789400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0367, - "step": 789500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 789600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 789700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 789800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 789900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0364, - "step": 790000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 790100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 790200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 790300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 790400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 790500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 790600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 790700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 790800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 790900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 791000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0363, - "step": 791100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 791200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 791300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 791400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 791500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 791600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 791700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 791800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 791900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 792000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 792100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 792200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 792300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 792400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 792500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 792600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 792700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 792800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 792900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 793000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 793100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 793200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 793300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 793400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 793500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 793600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 793700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 793800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 793900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 794000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 794100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 794200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 794300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 794400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 794500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 794600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 794700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 794800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 794900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 795000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 795100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 795200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 795300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 795400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 795500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 795600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 795700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 795800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 795900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 796000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 796100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 796200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0373, - "step": 796300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 796400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 796500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 796600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 796700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 796800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 796900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 797000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 797100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 797200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0365, - "step": 797300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 797400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 797500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 797600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 797700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0365, - "step": 797800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 797900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0364, - "step": 798000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 798100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 798200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 798300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 798400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 798500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 798600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 798700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 798800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 798900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 799000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 799100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 799200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 799300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 799400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 799500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 799600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 799700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 799800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 799900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 800000 - }, - { - "epoch": 0.0, - "eval_loss": 0.031463623046875, - "eval_runtime": 3435.6458, - "eval_samples_per_second": 327.369, - "eval_steps_per_second": 20.461, - "step": 800000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 800100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0383, - "step": 800200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 800300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 800400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 800500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 800600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 800700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 800800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 800900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 801000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 801100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 801200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 801300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 801400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 801500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 801600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 801700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 801800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 801900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 802000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 802100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 802200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 802300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 802400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 802500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 802600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 802700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 802800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 802900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 803000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 803100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 803200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 803300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 803400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 803500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 803600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 803700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 803800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 803900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 804000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 804100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 804200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 804300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 804400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 804500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 804600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 804700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0363, - "step": 804800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 804900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 805000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 805100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 805200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 805300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 805400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 805500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 805600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 805700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 805800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 805900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 806000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 806100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 806200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 806300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 806400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 806500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 806600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 806700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 806800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 806900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 807000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 807100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 807200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 807300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 807400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 807500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 807600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 807700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0363, - "step": 807800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 807900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 808000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 808100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 808200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 808300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 808400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 808500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 808600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 808700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 808800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 808900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 809000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 809100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 809200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0367, - "step": 809300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 809400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 809500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 809600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 809700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 809800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 809900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 810000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 810100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 810200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 810300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 810400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 810500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 810600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 810700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0367, - "step": 810800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 810900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 811000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 811100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 811200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 811300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 811400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 811500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 811600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 811700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 811800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 811900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 812000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 812100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 812200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 812300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 812400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 812500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 812600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 812700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 812800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 812900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 813000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 813100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 813200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 813300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 813400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 813500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 813600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 813700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 813800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 813900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 814000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 814100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 814200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 814300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 814400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 814500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 814600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 814700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 814800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 814900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 815000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 815100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 815200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 815300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 815400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 815500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 815600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 815700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 815800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 815900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 816000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 816100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 816200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 816300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 816400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 816500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 816600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 816700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 816800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 816900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 817000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 817100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 817200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 817300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 817400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 817500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 817600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 817700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 817800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 817900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 818000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 818100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0367, - "step": 818200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 818300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 818400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 818500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 818600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 818700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 818800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 818900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 819000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 819100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 819200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 819300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 819400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 819500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 819600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 819700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 819800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 819900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 820000 - }, - { - "epoch": 0.0, - "eval_loss": 0.031341552734375, - "eval_runtime": 3287.9169, - "eval_samples_per_second": 342.078, - "eval_steps_per_second": 21.38, - "step": 820000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 820100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 820200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 820300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 820400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 820500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 820600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 820700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 820800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 820900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 821000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 821100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 821200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 821300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 821400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 821500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 821600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 821700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 821800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 821900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 822000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 822100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 822200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 822300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 822400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 822500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 822600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 822700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 822800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 822900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 823000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 823100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 823200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 823300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 823400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 823500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 823600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 823700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 823800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 823900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 824000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 824100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 824200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 824300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 824400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 824500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 824600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 824700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 824800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 824900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 825000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 825100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 825200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 825300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 825400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 825500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 825600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 825700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 825800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 825900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 826000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 826100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0366, - "step": 826200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 826300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 826400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 826500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 826600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 826700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 826800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 826900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 827000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 827100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 827200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 827300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 827400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 827500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 827600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 827700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 827800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 827900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 828000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 828100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 828200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 828300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 828400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 828500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 828600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 828700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 828800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 828900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 829000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 829100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 829200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 829300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 829400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 829500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 829600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 829700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 829800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 829900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 830000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 830100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0384, - "step": 830200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0364, - "step": 830300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 830400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 830500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 830600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0372, - "step": 830700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 830800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 830900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 831000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 831100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 831200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 831300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 831400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 831500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 831600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 831700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 831800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 831900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 832000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 832100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 832200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 832300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 832400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 832500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 832600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 832700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 832800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0364, - "step": 832900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 833000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0379, - "step": 833100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 833200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 833300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 833400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 833500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 833600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 833700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 833800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 833900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 834000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 834100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 834200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 834300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 834400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 834500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 834600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 834700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 834800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0365, - "step": 834900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 835000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 835100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 835200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 835300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 835400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 835500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 835600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0365, - "step": 835700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 835800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 835900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 836000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 836100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0364, - "step": 836200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 836300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 836400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 836500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 836600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 836700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 836800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 836900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 837000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 837100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 837200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 837300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 837400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 837500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 837600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 837700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 837800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0372, - "step": 837900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 838000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 838100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 838200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 838300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 838400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 838500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 838600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 838700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 838800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 838900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 839000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 839100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 839200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 839300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 839400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 839500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 839600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 839700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 839800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 839900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 840000 - }, - { - "epoch": 0.0, - "eval_loss": 0.031341552734375, - "eval_runtime": 3015.0718, - "eval_samples_per_second": 373.034, - "eval_steps_per_second": 23.315, - "step": 840000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 840100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 840200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 840300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 840400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 840500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 840600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 840700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 840800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 840900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 841000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 841100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 841200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 841300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 841400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 841500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 841600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 841700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 841800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 841900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 842000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 842100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 842200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 842300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 842400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 842500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 842600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 842700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 842800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 842900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 843000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 843100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 843200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 843300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 843400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 843500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 843600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 843700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 843800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 843900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 844000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 844100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 844200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 844300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 844400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 844500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 844600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 844700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 844800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 844900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 845000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 845100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 845200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 845300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 845400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 845500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 845600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 845700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 845800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 845900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 846000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 846100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 846200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 846300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 846400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 846500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 846600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 846700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 846800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 846900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 847000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 847100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 847200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 847300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 847400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 847500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 847600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 847700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 847800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 847900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 848000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 848100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 848200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 848300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0385, - "step": 848400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 848500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 848600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 848700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 848800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 848900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 849000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 849100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 849200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 849300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 849400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 849500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 849600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 849700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 849800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 849900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 850000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 850100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 850200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 850300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 850400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 850500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 850600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 850700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 850800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 850900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 851000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 851100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 851200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 851300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 851400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 851500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 851600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 851700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 851800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 851900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 852000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 852100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 852200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 852300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 852400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 852500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 852600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0388, - "step": 852700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 852800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 852900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 853000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 853100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 853200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 853300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 853400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 853500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 853600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 853700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 853800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 853900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 854000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 854100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 854200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 854300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 854400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 854500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 854600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 854700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 854800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 854900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 855000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 855100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 855200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 855300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0366, - "step": 855400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0384, - "step": 855500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 855600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 855700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 855800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 855900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 856000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 856100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 856200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0385, - "step": 856300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 856400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 856500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 856600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 856700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 856800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 856900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 857000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 857100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 857200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 857300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 857400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 857500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 857600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 857700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 857800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 857900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 858000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 858100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 858200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 858300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 858400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 858500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 858600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 858700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 858800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 858900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 859000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 859100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 859200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 859300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 859400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 859500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 859600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 859700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 859800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 859900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 860000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0307769775390625, - "eval_runtime": 3051.3726, - "eval_samples_per_second": 368.596, - "eval_steps_per_second": 23.038, - "step": 860000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 860100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 860200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 860300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 860400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 860500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 860600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 860700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 860800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 860900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 861000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 861100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 861200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 861300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 861400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 861500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 861600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 861700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 861800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 861900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 862000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 862100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 862200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 862300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 862400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 862500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 862600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 862700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 862800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 862900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 863000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 863100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 863200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 863300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 863400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 863500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 863600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 863700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 863800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 863900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 864000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 864100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 864200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 864300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 864400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 864500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 864600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 864700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 864800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 864900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 865000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 865100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 865200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 865300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 865400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 865500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 865600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0364, - "step": 865700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0369, - "step": 865800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 865900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 866000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 866100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 866200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 866300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 866400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0363, - "step": 866500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 866600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 866700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 866800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 866900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 867000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 867100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0364, - "step": 867200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 867300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 867400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 867500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 867600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 867700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 867800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 867900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 868000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 868100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 868200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 868300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 868400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 868500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 868600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 868700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 868800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 868900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 869000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 869100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 869200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 869300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 869400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 869500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 869600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 869700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 869800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 869900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 870000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 870100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 870200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 870300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 870400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 870500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 870600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 870700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 870800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 870900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 871000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 871100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 871200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 871300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 871400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 871500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 871600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 871700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 871800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 871900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 872000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 872100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 872200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 872300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 872400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 872500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 872600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 872700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 872800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 872900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 873000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 873100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 873200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 873300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 873400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 873500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 873600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 873700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 873800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 873900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 874000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 874100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 874200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 874300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 874400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 874500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 874600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 874700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 874800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 874900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 875000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 875100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 875200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 875300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 875400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 875500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 875600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 875700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0376, - "step": 875800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 875900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 876000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 876100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 876200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 876300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 876400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 876500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 876600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 876700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 876800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 876900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 877000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 877100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 877200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 877300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 877400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0373, - "step": 877500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 877600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 877700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 877800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 877900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 878000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 878100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 878200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 878300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 878400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 878500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 878600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 878700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0367, - "step": 878800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0363, - "step": 878900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 879000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 879100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 879200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 879300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0366, - "step": 879400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 879500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 879600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 879700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 879800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 879900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 880000 - }, - { - "epoch": 0.0, - "eval_loss": 0.03094482421875, - "eval_runtime": 3080.9329, - "eval_samples_per_second": 365.059, - "eval_steps_per_second": 22.816, - "step": 880000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 880100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 880200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 880300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 880400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 880500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 880600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 880700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 880800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 880900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 881000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 881100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 881200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0364, - "step": 881300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 881400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 881500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 881600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 881700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 881800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 881900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 882000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 882100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 882200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 882300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 882400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 882500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 882600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 882700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 882800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 882900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 883000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 883100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 883200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 883300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 883400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 883500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 883600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 883700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 883800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 883900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 884000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 884100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 884200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 884300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 884400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 884500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 884600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 884700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 884800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 884900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 885000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 885100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 885200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 885300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 885400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 885500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 885600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 885700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 885800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 885900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 886000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 886100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 886200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 886300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 886400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 886500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 886600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 886700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 886800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 886900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 887000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 887100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 887200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 887300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 887400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 887500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 887600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0366, - "step": 887700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 887800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 887900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 888000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 888100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 888200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 888300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 888400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 888500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 888600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 888700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 888800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 888900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 889000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 889100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 889200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 889300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 889400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 889500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 889600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 889700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 889800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 889900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 890000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 890100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 890200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 890300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 890400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 890500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 890600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 890700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 890800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 890900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 891000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 891100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 891200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 891300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 891400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 891500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 891600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 891700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 891800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 891900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 892000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 892100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 892200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0385, - "step": 892300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 892400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 892500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 892600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 892700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 892800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 892900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 893000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 893100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 893200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 893300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 893400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 893500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 893600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 893700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 893800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 893900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 894000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 894100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 894200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 894300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 894400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 894500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 894600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 894700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 894800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 894900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 895000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 895100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 895200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 895300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 895400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 895500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 895600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 895700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 895800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 895900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 896000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 896100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 896200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 896300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 896400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 896500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 896600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 896700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 896800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 896900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 897000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 897100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 897200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 897300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 897400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 897500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 897600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 897700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 897800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 897900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 898000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 898100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 898200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 898300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 898400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 898500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 898600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 898700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 898800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 898900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 899000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 899100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 899200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 899300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 899400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 899500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 899600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 899700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 899800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 899900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 900000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0309295654296875, - "eval_runtime": 3200.3969, - "eval_samples_per_second": 351.432, - "eval_steps_per_second": 21.965, - "step": 900000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 900100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 900200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 900300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 900400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 900500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 900600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 900700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 900800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 900900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 901000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 901100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 901200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 901300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 901400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 901500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 901600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 901700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 901800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 901900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 902000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 902100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 902200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 902300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 902400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 902500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 902600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 902700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 902800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 902900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0367, - "step": 903000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 903100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 903200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 903300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 903400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 903500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 903600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 903700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 903800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 903900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 904000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 904100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 904200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 904300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 904400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 904500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 904600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 904700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 904800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 904900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 905000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 905100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 905200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 905300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 905400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 905500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 905600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 905700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 905800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 905900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 906000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 906100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 906200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 906300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 906400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 906500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 906600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 906700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 906800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 906900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 907000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0385, - "step": 907100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 907200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 907300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 907400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 907500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 907600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 907700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 907800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 907900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 908000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 908100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 908200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0377, - "step": 908300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 908400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 908500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 908600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 908700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 908800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 908900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 909000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 909100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 909200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 909300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 909400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 909500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 909600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 909700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 909800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 909900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 910000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 910100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 910200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 910300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 910400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 910500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 910600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 910700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 910800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 910900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 911000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 911100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 911200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 911300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 911400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 911500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 911600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 911700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 911800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 911900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 912000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 912100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 912200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 912300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 912400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 912500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 912600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 912700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 912800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 912900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 913000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 913100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 913200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 913300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 913400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 913500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 913600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 913700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 913800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 913900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 914000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0366, - "step": 914100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 914200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 914300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0375, - "step": 914400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 914500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 914600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 914700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 914800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 914900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 915000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 915100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 915200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 915300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 915400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 915500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 915600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 915700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 915800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 915900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 916000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 916100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 916200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 916300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 916400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 916500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 916600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 916700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 916800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 916900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 917000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 917100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 917200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 917300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 917400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 917500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 917600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 917700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0364, - "step": 917800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 917900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 918000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 918100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 918200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 918300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 918400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 918500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 918600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 918700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 918800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 918900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 919000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 919100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 919200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 919300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 919400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 919500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 919600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 919700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0363, - "step": 919800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 919900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 920000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0310516357421875, - "eval_runtime": 3211.005, - "eval_samples_per_second": 350.271, - "eval_steps_per_second": 21.892, - "step": 920000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0366, - "step": 920100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 920200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 920300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 920400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 920500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 920600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 920700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 920800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 920900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 921000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 921100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 921200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 921300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 921400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 921500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 921600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 921700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 921800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 921900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0363, - "step": 922000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 922100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 922200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 922300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 922400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 922500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 922600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 922700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 922800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 922900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 923000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 923100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 923200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0363, - "step": 923300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 923400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 923500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 923600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 923700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 923800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 923900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 924000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 924100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 924200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 924300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 924400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 924500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 924600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 924700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 924800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 924900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 925000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 925100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0368, - "step": 925200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 925300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 925400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 925500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 925600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 925700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 925800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 925900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 926000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 926100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 926200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 926300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 926400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 926500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 926600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 926700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 926800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 926900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 927000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 927100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 927200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 927300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 927400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 927500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 927600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 927700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 927800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 927900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 928000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 928100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 928200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 928300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 928400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 928500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 928600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 928700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 928800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 928900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 929000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 929100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 929200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 929300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 929400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 929500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 929600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 929700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 929800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 929900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 930000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 930100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 930200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 930300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 930400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 930500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 930600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 930700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 930800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 930900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 931000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 931100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 931200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 931300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 931400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 931500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 931600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 931700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 931800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 931900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 932000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 932100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 932200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 932300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 932400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 932500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 932600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 932700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 932800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 932900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 933000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 933100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 933200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 933300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 933400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0371, - "step": 933500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 933600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 933700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 933800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 933900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 934000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 934100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 934200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 934300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 934400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 934500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 934600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 934700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 934800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 934900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 935000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 935100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 935200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 935300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 935400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 935500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 935600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 935700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 935800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 935900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 936000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 936100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 936200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 936300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 936400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 936500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 936600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 936700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 936800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 936900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 937000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 937100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 937200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 937300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 937400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 937500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 937600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 937700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 937800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 937900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 938000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 938100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 938200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 938300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 938400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 938500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 938600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 938700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 938800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 938900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 939000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 939100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 939200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 939300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 939400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 939500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 939600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0365, - "step": 939700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0364, - "step": 939800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 939900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 940000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0307159423828125, - "eval_runtime": 3409.0727, - "eval_samples_per_second": 329.921, - "eval_steps_per_second": 20.62, - "step": 940000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 940100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 940200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 940300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0364, - "step": 940400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 940500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 940600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 940700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 940800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 940900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 941000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 941100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 941200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 941300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 941400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 941500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 941600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 941700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 941800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 941900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 942000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 942100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 942200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 942300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 942400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 942500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 942600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 942700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 942800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 942900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 943000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 943100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 943200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 943300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 943400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 943500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 943600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 943700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 943800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0363, - "step": 943900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 944000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 944100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 944200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 944300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 944400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 944500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 944600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 944700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 944800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 944900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 945000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0369, - "step": 945100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 945200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 945300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 945400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 945500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 945600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 945700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 945800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 945900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 946000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 946100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 946200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 946300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 946400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 946500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 946600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 946700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 946800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 946900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 947000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 947100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 947200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 947300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 947400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 947500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 947600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 947700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 947800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 947900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 948000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0372, - "step": 948100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 948200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 948300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 948400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 948500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 948600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 948700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 948800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 948900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 949000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 949100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 949200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 949300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 949400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 949500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 949600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 949700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 949800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 949900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 950000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 950100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 950200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 950300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 950400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 950500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 950600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 950700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 950800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 950900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 951000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 951100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 951200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 951300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 951400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 951500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 951600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 951700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 951800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 951900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 952000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 952100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 952200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 952300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 952400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 952500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 952600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 952700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 952800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 952900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 953000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 953100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 953200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 953300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 953400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 953500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 953600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 953700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 953800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 953900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 954000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 954100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 954200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 954300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 954400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 954500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 954600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0375, - "step": 954700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 954800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 954900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 955000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 955100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 955200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 955300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 955400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 955500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 955600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0363, - "step": 955700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 955800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 955900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 956000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 956100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 956200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 956300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 956400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 956500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 956600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 956700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 956800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 956900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 957000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 957100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 957200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 957300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 957400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 957500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 957600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 957700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 957800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 957900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 958000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 958100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 958200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 958300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 958400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 958500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 958600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 958700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 958800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 958900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 959000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0365, - "step": 959100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 959200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 959300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 959400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 959500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 959600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 959700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 959800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 959900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 960000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0309295654296875, - "eval_runtime": 3121.7204, - "eval_samples_per_second": 360.289, - "eval_steps_per_second": 22.518, - "step": 960000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 960100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 960200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 960300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 960400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 960500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 960600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 960700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 960800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 960900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 961000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 961100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 961200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 961300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 961400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 961500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 961600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 961700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 961800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 961900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 962000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 962100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 962200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 962300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 962400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 962500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 962600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 962700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 962800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 962900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 963000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 963100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 963200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 963300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 963400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 963500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 963600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 963700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 963800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 963900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 964000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 964100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 964200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 964300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 964400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 964500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 964600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 964700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 964800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 964900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 965000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0383, - "step": 965100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 965200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 965300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 965400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 965500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 965600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 965700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 965800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 965900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 966000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 966100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 966200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 966300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 966400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 966500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 966600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 966700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 966800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 966900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 967000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 967100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 967200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 967300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 967400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 967500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 967600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 967700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 967800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 967900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 968000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 968100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 968200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 968300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 968400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 968500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 968600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 968700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 968800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 968900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 969000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 969100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 969200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 969300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 969400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 969500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 969600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 969700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 969800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 969900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 970000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 970100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 970200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 970300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 970400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 970500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0368, - "step": 970600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 970700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 970800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 970900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 971000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 971100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 971200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 971300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 971400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 971500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 971600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 971700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 971800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 971900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 972000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 972100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 972200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 972300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 972400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 972500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 972600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 972700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 972800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 972900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 973000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 973100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 973200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 973300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 973400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 973500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 973600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 973700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 973800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 973900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 974000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 974100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0374, - "step": 974200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 974300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 974400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 974500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 974600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 974700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 974800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 974900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 975000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 975100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 975200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 975300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 975400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 975500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 975600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 975700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0366, - "step": 975800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 975900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 976000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 976100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 976200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 976300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 976400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 976500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 976600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 976700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 976800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 976900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 977000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 977100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 977200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 977300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 977400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 977500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 977600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 977700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 977800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0367, - "step": 977900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 978000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 978100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 978200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 978300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 978400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 978500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 978600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 978700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 978800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 978900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 979000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 979100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 979200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 979300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 979400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 979500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 979600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 979700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 979800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 979900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 980000 - }, - { - "epoch": 0.0, - "eval_loss": 0.030792236328125, - "eval_runtime": 3092.5375, - "eval_samples_per_second": 363.689, - "eval_steps_per_second": 22.731, - "step": 980000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 980100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 980200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 980300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 980400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 980500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 980600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 980700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 980800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 980900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0364, - "step": 981000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 981100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 981200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 981300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 981400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 981500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 981600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 981700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 981800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 981900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 982000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 982100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 982200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0373, - "step": 982300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 982400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 982500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 982600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 982700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 982800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 982900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 983000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 983100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 983200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 983300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 983400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 983500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 983600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 983700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 983800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 983900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 984000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 984100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 984200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 984300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 984400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 984500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 984600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 984700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 984800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 984900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 985000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 985100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 985200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 985300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 985400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 985500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 985600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 985700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 985800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 985900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 986000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 986100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 986200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 986300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 986400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 986500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 986600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 986700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 986800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 986900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 987000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 987100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 987200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 987300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 987400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 987500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 987600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 987700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0363, - "step": 987800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 987900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 988000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 988100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 988200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 988300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 988400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 988500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 988600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 988700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 988800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 988900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 989000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 989100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 989200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 989300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 989400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 989500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 989600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 989700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 989800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 989900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 990000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 990100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 990200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 990300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 990400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 990500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 990600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 990700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 990800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 990900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 991000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 991100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 991200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 991300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 991400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 991500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 991600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 991700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 991800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 991900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 992000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 992100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 992200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 992300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 992400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 992500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 992600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 992700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 992800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 992900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 993000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 993100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 993200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 993300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 993400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 993500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 993600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 993700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 993800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 993900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 994000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 994100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 994200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 994300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 994400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 994500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 994600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 994700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 994800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 994900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 995000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 995100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 995200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 995300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 995400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 995500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 995600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 995700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 995800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 995900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 996000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 996100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 996200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 996300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 996400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 996500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 996600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 996700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 996800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 996900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 997000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 997100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 997200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 997300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 997400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 997500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 997600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 997700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 997800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 997900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 998000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 998100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 998200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 998300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 998400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 998500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 998600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 998700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 998800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 998900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 999000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 999100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 999200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 999300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 999400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 999500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 999600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0372, - "step": 999700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 999800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 999900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1000000 - }, - { - "epoch": 0.0, - "eval_loss": 0.03070068359375, - "eval_runtime": 3196.3803, - "eval_samples_per_second": 351.874, - "eval_steps_per_second": 21.992, - "step": 1000000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1000100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1000200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 1000300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 1000400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 1000500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 1000600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 1000700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1000800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1000900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1001000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1001100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1001200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 1001300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1001400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 1001500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1001600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 1001700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1001800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 1001900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1002000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1002100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 1002200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1002300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0363, - "step": 1002400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1002500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1002600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0369, - "step": 1002700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1002800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1002900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 1003000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 1003100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1003200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1003300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1003400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1003500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1003600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1003700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1003800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 1003900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1004000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 1004100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 1004200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 1004300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1004400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1004500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1004600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1004700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1004800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1004900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1005000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1005100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1005200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1005300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1005400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 1005500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1005600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1005700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 1005800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 1005900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1006000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1006100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1006200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1006300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1006400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 1006500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 1006600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0363, - "step": 1006700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 1006800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 1006900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1007000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1007100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 1007200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1007300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 1007400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1007500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1007600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 1007700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1007800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 1007900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 1008000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 1008100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1008200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1008300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1008400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1008500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1008600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1008700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1008800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 1008900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1009000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1009100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1009200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 1009300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 1009400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 1009500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1009600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1009700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1009800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1009900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1010000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1010100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 1010200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1010300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1010400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1010500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1010600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1010700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 1010800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1010900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 1011000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1011100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1011200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1011300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 1011400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1011500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 1011600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1011700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 1011800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1011900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1012000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1012100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1012200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1012300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1012400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1012500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1012600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1012700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1012800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1012900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1013000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1013100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1013200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1013300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0364, - "step": 1013400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1013500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1013600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 1013700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 1013800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 1013900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1014000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 1014100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1014200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1014300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1014400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1014500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1014600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1014700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1014800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1014900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1015000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1015100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1015200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1015300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1015400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1015500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1015600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 1015700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1015800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1015900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1016000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1016100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1016200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1016300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1016400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 1016500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1016600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 1016700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1016800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1016900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1017000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 1017100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1017200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 1017300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1017400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1017500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1017600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1017700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1017800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1017900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 1018000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1018100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1018200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1018300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1018400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1018500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1018600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 1018700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1018800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1018900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1019000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1019100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1019200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1019300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1019400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 1019500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1019600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1019700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1019800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1019900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 1020000 - }, - { - "epoch": 0.0, - "eval_loss": 0.030609130859375, - "eval_runtime": 3266.2908, - "eval_samples_per_second": 344.343, - "eval_steps_per_second": 21.522, - "step": 1020000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1020100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 1020200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1020300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1020400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1020500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 1020600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1020700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1020800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 1020900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1021000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 1021100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1021200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1021300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1021400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 1021500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1021600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 1021700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1021800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1021900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1022000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1022100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1022200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1022300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1022400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1022500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1022600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1022700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1022800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1022900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1023000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1023100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1023200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1023300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 1023400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1023500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1023600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1023700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 1023800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1023900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 1024000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1024100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1024200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 1024300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1024400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1024500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 1024600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1024700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1024800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1024900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1025000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1025100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1025200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 1025300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1025400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 1025500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1025600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1025700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1025800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1025900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1026000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0375, - "step": 1026100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1026200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1026300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1026400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 1026500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1026600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1026700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1026800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0373, - "step": 1026900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1027000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1027100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1027200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 1027300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1027400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1027500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1027600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1027700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1027800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1027900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1028000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1028100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1028200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1028300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1028400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1028500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1028600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1028700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1028800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1028900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1029000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1029100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1029200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1029300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1029400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1029500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 1029600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1029700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1029800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1029900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 1030000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1030100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 1030200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 1030300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1030400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1030500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1030600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1030700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1030800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1030900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1031000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 1031100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1031200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1031300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1031400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1031500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 1031600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1031700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 1031800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1031900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1032000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1032100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1032200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1032300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1032400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1032500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1032600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1032700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 1032800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1032900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1033000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1033100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1033200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1033300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 1033400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1033500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 1033600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1033700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1033800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1033900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1034000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1034100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1034200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1034300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1034400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 1034500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1034600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1034700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1034800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1034900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 1035000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1035100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1035200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1035300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1035400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1035500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1035600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1035700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1035800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1035900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1036000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1036100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1036200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 1036300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1036400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1036500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 1036600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 1036700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 1036800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 1036900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1037000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1037100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1037200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1037300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 1037400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1037500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1037600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 1037700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1037800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1037900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1038000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1038100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 1038200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 1038300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1038400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1038500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1038600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 1038700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1038800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 1038900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1039000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1039100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1039200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1039300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1039400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 1039500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1039600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1039700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1039800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1039900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1040000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0306549072265625, - "eval_runtime": 3024.1998, - "eval_samples_per_second": 371.908, - "eval_steps_per_second": 23.244, - "step": 1040000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1040100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 1040200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1040300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 1040400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1040500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1040600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1040700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1040800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1040900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1041000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1041100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1041200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1041300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1041400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 1041500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1041600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1041700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1041800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1041900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1042000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1042100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1042200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 1042300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1042400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1042500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1042600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1042700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1042800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0369, - "step": 1042900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1043000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1043100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1043200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1043300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1043400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1043500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1043600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1043700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1043800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1043900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1044000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1044100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 1044200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1044300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1044400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1044500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1044600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1044700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 1044800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 1044900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1045000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1045100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1045200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1045300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 1045400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1045500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1045600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1045700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1045800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 1045900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 1046000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 1046100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 1046200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1046300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 1046400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1046500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1046600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 1046700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1046800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1046900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1047000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1047100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1047200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1047300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1047400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1047500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1047600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1047700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1047800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1047900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1048000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1048100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1048200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1048300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1048400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 1048500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1048600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1048700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1048800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1048900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1049000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1049100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1049200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1049300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1049400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1049500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1049600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1049700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 1049800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1049900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1050000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1050100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 1050200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1050300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 1050400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1050500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1050600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 1050700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 1050800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1050900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1051000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 1051100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 1051200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1051300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1051400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1051500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1051600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1051700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1051800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1051900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1052000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1052100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1052200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1052300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 1052400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1052500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 1052600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1052700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1052800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1052900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 1053000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1053100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1053200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1053300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 1053400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1053500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1053600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1053700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1053800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1053900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1054000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1054100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1054200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1054300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1054400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 1054500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1054600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0368, - "step": 1054700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1054800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1054900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 1055000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1055100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 1055200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 1055300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1055400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1055500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0364, - "step": 1055600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1055700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1055800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 1055900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1056000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 1056100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1056200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1056300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 1056400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 1056500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1056600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1056700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1056800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1056900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1057000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1057100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 1057200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1057300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1057400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1057500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1057600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1057700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 1057800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 1057900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1058000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1058100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1058200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1058300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1058400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 1058500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1058600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 1058700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1058800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1058900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1059000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1059100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1059200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1059300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1059400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 1059500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 1059600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1059700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1059800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1059900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1060000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0303802490234375, - "eval_runtime": 3085.9361, - "eval_samples_per_second": 364.467, - "eval_steps_per_second": 22.779, - "step": 1060000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1060100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1060200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1060300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1060400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1060500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1060600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1060700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1060800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1060900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1061000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1061100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1061200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 1061300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1061400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 1061500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1061600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 1061700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1061800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1061900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1062000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 1062100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 1062200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1062300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1062400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1062500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1062600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1062700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1062800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 1062900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 1063000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1063100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1063200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1063300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 1063400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1063500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1063600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 1063700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1063800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1063900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1064000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1064100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1064200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1064300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 1064400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1064500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1064600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1064700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 1064800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 1064900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1065000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1065100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1065200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1065300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1065400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 1065500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1065600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1065700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1065800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1065900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 1066000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1066100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1066200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1066300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1066400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1066500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 1066600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1066700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1066800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1066900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1067000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1067100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1067200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1067300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1067400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 1067500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 1067600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1067700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1067800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1067900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1068000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1068100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1068200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1068300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1068400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1068500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1068600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1068700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0363, - "step": 1068800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1068900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1069000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 1069100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1069200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1069300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1069400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 1069500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1069600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1069700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 1069800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1069900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1070000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1070100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1070200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1070300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 1070400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1070500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 1070600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1070700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1070800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1070900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1071000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1071100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 1071200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1071300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1071400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1071500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1071600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1071700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1071800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1071900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1072000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 1072100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1072200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1072300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 1072400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1072500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1072600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1072700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1072800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1072900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1073000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 1073100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1073200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1073300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1073400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1073500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 1073600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1073700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 1073800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1073900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 1074000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1074100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1074200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 1074300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1074400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1074500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1074600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1074700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 1074800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1074900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1075000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1075100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1075200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1075300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1075400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 1075500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1075600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1075700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1075800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1075900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 1076000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1076100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1076200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 1076300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1076400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 1076500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1076600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 1076700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1076800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1076900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1077000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1077100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1077200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1077300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1077400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1077500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1077600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1077700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1077800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1077900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1078000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1078100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1078200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 1078300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1078400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 1078500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 1078600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1078700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1078800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1078900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1079000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1079100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1079200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1079300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1079400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1079500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1079600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1079700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 1079800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 1079900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 1080000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0302734375, - "eval_runtime": 3131.3287, - "eval_samples_per_second": 359.184, - "eval_steps_per_second": 22.449, - "step": 1080000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1080100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1080200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 1080300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1080400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 1080500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1080600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1080700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1080800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1080900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 1081000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1081100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 1081200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1081300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1081400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1081500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1081600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1081700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1081800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1081900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1082000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1082100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1082200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1082300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1082400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1082500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1082600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1082700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1082800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 1082900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1083000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1083100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1083200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1083300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1083400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1083500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1083600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1083700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1083800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1083900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 1084000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1084100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1084200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1084300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1084400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1084500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1084600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 1084700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1084800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1084900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 1085000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1085100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 1085200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 1085300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1085400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1085500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1085600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1085700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1085800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1085900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1086000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1086100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1086200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1086300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1086400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1086500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1086600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1086700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1086800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1086900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1087000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1087100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 1087200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1087300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1087400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1087500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1087600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 1087700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1087800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 1087900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1088000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1088100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1088200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1088300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1088400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1088500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 1088600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1088700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1088800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1088900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1089000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1089100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1089200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1089300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1089400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 1089500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1089600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1089700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1089800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1089900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 1090000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1090100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1090200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1090300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1090400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1090500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1090600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1090700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 1090800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1090900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1091000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1091100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1091200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1091300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 1091400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1091500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1091600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1091700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1091800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1091900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1092000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1092100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 1092200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1092300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 1092400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1092500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1092600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1092700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 1092800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1092900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 1093000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1093100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 1093200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1093300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 1093400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1093500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1093600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1093700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1093800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1093900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1094000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1094100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1094200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 1094300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1094400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1094500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1094600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 1094700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 1094800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1094900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 1095000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 1095100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1095200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1095300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1095400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 1095500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1095600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1095700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1095800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1095900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1096000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 1096100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 1096200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1096300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1096400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1096500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1096600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1096700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1096800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 1096900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1097000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1097100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1097200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 1097300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1097400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 1097500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1097600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1097700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 1097800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1097900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1098000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1098100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1098200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1098300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1098400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1098500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1098600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 1098700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1098800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1098900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1099000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1099100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1099200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1099300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1099400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 1099500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1099600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1099700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 1099800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1099900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 1100000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0305328369140625, - "eval_runtime": 3084.7934, - "eval_samples_per_second": 364.602, - "eval_steps_per_second": 22.788, - "step": 1100000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1100100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1100200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 1100300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1100400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1100500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1100600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1100700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1100800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1100900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1101000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1101100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1101200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1101300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1101400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1101500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1101600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 1101700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1101800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1101900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1102000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1102100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1102200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 1102300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1102400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1102500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1102600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1102700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1102800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1102900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1103000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1103100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1103200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1103300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1103400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1103500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1103600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1103700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1103800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1103900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1104000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1104100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 1104200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1104300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1104400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1104500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1104600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1104700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1104800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1104900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1105000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 1105100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1105200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1105300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1105400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1105500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1105600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1105700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1105800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1105900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 1106000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1106100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1106200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1106300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1106400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1106500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1106600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1106700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1106800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1106900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1107000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1107100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1107200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1107300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 1107400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1107500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 1107600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1107700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1107800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1107900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1108000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1108100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1108200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1108300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1108400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1108500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1108600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1108700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 1108800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1108900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1109000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1109100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1109200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1109300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1109400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 1109500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1109600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1109700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1109800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1109900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 1110000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1110100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1110200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1110300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1110400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1110500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1110600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1110700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1110800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1110900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1111000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1111100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1111200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1111300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1111400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1111500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1111600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1111700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1111800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1111900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1112000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1112100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1112200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1112300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1112400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1112500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1112600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1112700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1112800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1112900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1113000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1113100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1113200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1113300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1113400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1113500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1113600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1113700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1113800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1113900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1114000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1114100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1114200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1114300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1114400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1114500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1114600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1114700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1114800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1114900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1115000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1115100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1115200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1115300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1115400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1115500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1115600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1115700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1115800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1115900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1116000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 1116100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1116200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1116300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1116400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1116500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1116600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1116700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 1116800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1116900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1117000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1117100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1117200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1117300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 1117400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 1117500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 1117600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 1117700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1117800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1117900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1118000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1118100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1118200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 1118300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1118400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1118500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1118600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1118700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1118800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1118900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1119000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1119100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1119200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1119300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1119400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 1119500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1119600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1119700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1119800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1119900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1120000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0302276611328125, - "eval_runtime": 3228.7917, - "eval_samples_per_second": 348.342, - "eval_steps_per_second": 21.772, - "step": 1120000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1120100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1120200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1120300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 1120400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1120500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1120600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1120700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1120800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 1120900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1121000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1121100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1121200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1121300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1121400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1121500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 1121600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 1121700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 1121800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1121900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1122000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1122100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1122200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1122300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1122400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1122500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1122600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1122700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1122800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1122900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1123000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1123100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1123200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1123300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1123400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1123500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1123600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1123700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1123800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1123900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 1124000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 1124100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1124200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1124300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1124400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1124500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1124600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 1124700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1124800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1124900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1125000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1125100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1125200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 1125300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1125400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1125500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1125600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1125700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 1125800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1125900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1126000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1126100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1126200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1126300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1126400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1126500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1126600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1126700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1126800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1126900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 1127000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1127100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1127200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1127300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1127400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1127500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1127600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1127700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1127800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1127900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1128000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 1128100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1128200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1128300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1128400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 1128500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1128600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1128700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1128800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1128900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1129000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1129100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 1129200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1129300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1129400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1129500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1129600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1129700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1129800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1129900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1130000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1130100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1130200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1130300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1130400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1130500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1130600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 1130700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 1130800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1130900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1131000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1131100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1131200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1131300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1131400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1131500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1131600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 1131700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 1131800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1131900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 1132000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1132100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1132200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1132300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1132400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1132500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1132600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1132700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1132800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 1132900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1133000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 1133100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1133200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 1133300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1133400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 1133500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1133600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1133700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1133800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1133900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 1134000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1134100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1134200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1134300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1134400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1134500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1134600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1134700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1134800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1134900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 1135000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1135100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1135200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 1135300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 1135400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1135500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1135600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1135700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1135800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1135900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1136000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1136100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1136200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1136300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1136400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1136500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1136600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1136700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1136800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1136900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1137000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1137100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 1137200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1137300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1137400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1137500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1137600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1137700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1137800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1137900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1138000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 1138100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1138200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1138300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1138400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 1138500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1138600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1138700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1138800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1138900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1139000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1139100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 1139200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 1139300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1139400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1139500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 1139600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1139700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1139800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1139900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1140000 - }, - { - "epoch": 0.0, - "eval_loss": 0.030303955078125, - "eval_runtime": 3150.8709, - "eval_samples_per_second": 356.956, - "eval_steps_per_second": 22.31, - "step": 1140000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1140100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1140200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1140300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1140400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1140500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1140600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 1140700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 1140800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1140900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1141000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1141100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 1141200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1141300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1141400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1141500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1141600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1141700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 1141800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1141900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1142000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1142100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1142200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1142300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1142400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1142500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1142600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 1142700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1142800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1142900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1143000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 1143100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1143200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1143300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1143400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1143500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1143600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1143700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1143800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1143900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 1144000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 1144100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 1144200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1144300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1144400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 1144500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1144600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1144700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1144800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1144900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1145000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1145100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 1145200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1145300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1145400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1145500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1145600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1145700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1145800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1145900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 1146000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1146100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1146200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1146300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 1146400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1146500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 1146600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1146700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1146800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1146900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1147000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1147100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 1147200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1147300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1147400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1147500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1147600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1147700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1147800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1147900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1148000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1148100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1148200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1148300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1148400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 1148500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1148600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1148700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 1148800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1148900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1149000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1149100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1149200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 1149300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1149400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1149500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1149600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1149700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1149800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1149900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1150000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1150100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1150200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1150300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1150400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1150500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1150600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1150700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1150800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1150900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1151000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1151100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1151200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1151300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1151400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1151500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 1151600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1151700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 1151800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1151900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1152000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1152100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 1152200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1152300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 1152400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 1152500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1152600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1152700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1152800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1152900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1153000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1153100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1153200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1153300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1153400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1153500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1153600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1153700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1153800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1153900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1154000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1154100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1154200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1154300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1154400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 1154500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1154600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1154700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1154800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1154900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1155000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1155100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1155200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1155300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1155400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1155500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1155600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 1155700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1155800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 1155900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 1156000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1156100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1156200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1156300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1156400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1156500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 1156600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1156700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 1156800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1156900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1157000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1157100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1157200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1157300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1157400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1157500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1157600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1157700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1157800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1157900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1158000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1158100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1158200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1158300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1158400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1158500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1158600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1158700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1158800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1158900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1159000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 1159100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1159200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1159300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1159400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1159500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1159600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1159700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1159800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1159900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1160000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0301055908203125, - "eval_runtime": 3130.1048, - "eval_samples_per_second": 359.324, - "eval_steps_per_second": 22.458, - "step": 1160000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1160100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1160200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1160300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 1160400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1160500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1160600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1160700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1160800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1160900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1161000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1161100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 1161200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 1161300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1161400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1161500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1161600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 1161700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 1161800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1161900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1162000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1162100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1162200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1162300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1162400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1162500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1162600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1162700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1162800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1162900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 1163000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1163100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1163200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 1163300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1163400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1163500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1163600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1163700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1163800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1163900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1164000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1164100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1164200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1164300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1164400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1164500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 1164600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 1164700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1164800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1164900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1165000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1165100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1165200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 1165300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1165400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1165500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1165600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1165700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1165800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1165900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1166000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1166100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1166200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1166300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1166400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1166500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1166600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1166700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1166800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 1166900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1167000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1167100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1167200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1167300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1167400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1167500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1167600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1167700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1167800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1167900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1168000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 1168100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 1168200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1168300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1168400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1168500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1168600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1168700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 1168800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 1168900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1169000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1169100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1169200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 1169300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1169400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1169500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1169600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1169700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 1169800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1169900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1170000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1170100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 1170200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1170300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1170400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1170500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1170600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 1170700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1170800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 1170900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1171000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1171100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1171200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1171300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1171400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1171500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1171600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1171700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1171800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1171900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1172000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 1172100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1172200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1172300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1172400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1172500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1172600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1172700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1172800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 1172900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1173000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 1173100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1173200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1173300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1173400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1173500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1173600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1173700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1173800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1173900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1174000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 1174100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 1174200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1174300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 1174400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1174500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1174600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1174700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1174800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1174900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1175000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1175100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1175200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1175300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1175400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1175500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1175600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1175700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1175800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 1175900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 1176000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 1176100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1176200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1176300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1176400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1176500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1176600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1176700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 1176800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 1176900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1177000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1177100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1177200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1177300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1177400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 1177500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1177600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1177700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1177800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1177900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1178000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1178100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1178200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1178300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1178400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1178500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1178600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 1178700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1178800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 1178900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1179000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1179100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 1179200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1179300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1179400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1179500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 1179600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1179700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 1179800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 1179900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 1180000 - }, - { - "epoch": 0.0, - "eval_loss": 0.02996826171875, - "eval_runtime": 3211.297, - "eval_samples_per_second": 350.239, - "eval_steps_per_second": 21.89, - "step": 1180000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1180100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1180200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1180300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1180400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1180500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 1180600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1180700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1180800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1180900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1181000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1181100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1181200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1181300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1181400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1181500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1181600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1181700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1181800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1181900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1182000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1182100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1182200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1182300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1182400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1182500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 1182600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 1182700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 1182800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1182900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1183000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1183100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1183200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1183300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1183400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1183500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 1183600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1183700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 1183800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1183900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 1184000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1184100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1184200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 1184300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1184400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1184500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1184600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1184700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1184800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1184900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1185000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1185100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1185200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 1185300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1185400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1185500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1185600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1185700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1185800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1185900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1186000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1186100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1186200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1186300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1186400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 1186500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 1186600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0377, - "step": 1186700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 1186800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1186900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 1187000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1187100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1187200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1187300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1187400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1187500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1187600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1187700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 1187800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1187900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1188000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1188100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1188200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1188300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1188400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1188500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1188600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1188700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1188800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1188900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1189000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1189100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1189200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1189300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1189400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1189500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1189600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1189700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1189800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1189900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 1190000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1190100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1190200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 1190300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1190400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1190500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 1190600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1190700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1190800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1190900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1191000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1191100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 1191200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1191300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1191400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1191500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1191600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1191700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1191800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1191900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1192000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1192100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1192200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1192300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1192400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1192500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1192600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1192700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1192800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 1192900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1193000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1193100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1193200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1193300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 1193400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1193500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1193600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1193700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1193800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1193900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1194000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1194100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 1194200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1194300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1194400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1194500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1194600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1194700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1194800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1194900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1195000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1195100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1195200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1195300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1195400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1195500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1195600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1195700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1195800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1195900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1196000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 1196100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1196200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1196300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1196400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1196500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1196600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1196700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1196800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1196900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1197000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 1197100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1197200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1197300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1197400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1197500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1197600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1197700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 1197800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1197900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1198000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1198100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1198200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1198300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1198400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1198500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1198600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 1198700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1198800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 1198900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1199000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1199100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1199200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 1199300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1199400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1199500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1199600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1199700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1199800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1199900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1200000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0304412841796875, - "eval_runtime": 4126.4351, - "eval_samples_per_second": 272.565, - "eval_steps_per_second": 17.036, - "step": 1200000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1200100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1200200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1200300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1200400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1200500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1200600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 1200700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1200800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1200900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1201000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1201100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 1201200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1201300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1201400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1201500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1201600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 1201700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1201800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1201900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 1202000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 1202100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1202200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1202300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1202400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1202500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1202600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 1202700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 1202800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1202900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1203000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1203100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1203200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 1203300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1203400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1203500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1203600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1203700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1203800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1203900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1204000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1204100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1204200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1204300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1204400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1204500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1204600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1204700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 1204800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1204900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 1205000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 1205100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1205200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 1205300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1205400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1205500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1205600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1205700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 1205800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1205900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1206000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1206100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1206200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1206300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1206400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1206500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1206600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 1206700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1206800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1206900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1207000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1207100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1207200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1207300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1207400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1207500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1207600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1207700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1207800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1207900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1208000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 1208100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1208200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1208300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1208400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1208500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1208600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1208700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1208800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1208900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1209000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1209100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1209200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1209300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1209400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1209500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1209600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 1209700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1209800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1209900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1210000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1210100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1210200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1210300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1210400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1210500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1210600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1210700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1210800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1210900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 1211000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1211100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1211200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1211300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1211400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 1211500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1211600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1211700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1211800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1211900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 1212000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1212100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1212200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1212300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1212400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1212500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1212600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1212700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1212800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1212900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1213000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1213100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1213200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1213300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1213400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1213500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1213600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1213700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1213800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1213900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1214000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1214100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1214200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1214300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1214400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1214500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1214600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1214700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1214800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1214900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1215000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1215100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 1215200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1215300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1215400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1215500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 1215600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 1215700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 1215800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1215900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1216000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1216100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1216200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1216300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 1216400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1216500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1216600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1216700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1216800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1216900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1217000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1217100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1217200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1217300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1217400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1217500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1217600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1217700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1217800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1217900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1218000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1218100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1218200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1218300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 1218400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1218500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1218600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1218700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1218800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1218900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1219000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1219100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1219200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 1219300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1219400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1219500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1219600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1219700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1219800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1219900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1220000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0304107666015625, - "eval_runtime": 3976.69, - "eval_samples_per_second": 282.829, - "eval_steps_per_second": 17.677, - "step": 1220000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1220100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1220200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1220300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1220400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1220500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1220600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1220700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1220800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1220900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1221000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 1221100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1221200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1221300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1221400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1221500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 1221600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1221700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0365, - "step": 1221800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1221900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1222000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1222100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 1222200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 1222300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1222400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1222500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 1222600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1222700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1222800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1222900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1223000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1223100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1223200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1223300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1223400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1223500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1223600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1223700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1223800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1223900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1224000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 1224100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1224200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1224300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1224400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1224500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1224600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1224700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1224800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1224900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1225000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1225100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 1225200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1225300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1225400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1225500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1225600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1225700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1225800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1225900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1226000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1226100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1226200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1226300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1226400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1226500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1226600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1226700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1226800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1226900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1227000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 1227100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 1227200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1227300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1227400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1227500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1227600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1227700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1227800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 1227900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1228000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1228100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1228200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1228300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1228400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1228500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1228600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1228700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1228800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1228900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1229000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1229100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1229200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0365, - "step": 1229300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1229400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 1229500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0412, - "step": 1229600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1229700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1229800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 1229900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1230000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1230100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1230200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1230300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1230400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1230500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1230600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1230700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1230800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1230900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1231000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1231100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1231200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1231300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1231400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1231500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1231600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1231700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1231800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1231900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1232000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 1232100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1232200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1232300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1232400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1232500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1232600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1232700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1232800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1232900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1233000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1233100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1233200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1233300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1233400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1233500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1233600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1233700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1233800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 1233900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1234000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1234100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1234200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1234300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1234400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1234500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1234600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1234700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1234800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1234900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1235000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1235100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1235200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1235300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1235400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1235500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1235600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1235700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 1235800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1235900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1236000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1236100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1236200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1236300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1236400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1236500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1236600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1236700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1236800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1236900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0366, - "step": 1237000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1237100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 1237200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1237300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 1237400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1237500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1237600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1237700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 1237800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1237900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1238000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1238100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1238200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1238300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 1238400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1238500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1238600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 1238700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1238800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1238900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 1239000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1239100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1239200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1239300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1239400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1239500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1239600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1239700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 1239800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1239900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 1240000 - }, - { - "epoch": 0.0, - "eval_loss": 0.03009033203125, - "eval_runtime": 3986.2287, - "eval_samples_per_second": 282.152, - "eval_steps_per_second": 17.635, - "step": 1240000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1240100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1240200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1240300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 1240400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1240500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1240600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1240700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1240800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1240900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1241000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1241100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1241200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1241300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1241400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1241500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1241600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1241700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1241800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1241900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1242000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1242100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1242200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1242300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1242400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1242500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1242600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 1242700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1242800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1242900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1243000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1243100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 1243200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1243300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1243400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1243500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1243600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1243700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1243800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1243900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1244000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1244100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1244200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1244300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1244400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1244500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1244600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 1244700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1244800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1244900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1245000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1245100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1245200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1245300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 1245400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1245500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1245600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1245700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1245800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1245900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1246000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1246100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 1246200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 1246300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1246400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1246500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1246600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1246700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1246800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1246900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 1247000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1247100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1247200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1247300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 1247400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1247500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1247600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1247700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1247800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1247900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1248000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1248100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1248200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1248300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1248400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1248500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1248600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1248700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1248800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1248900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1249000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 1249100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1249200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1249300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1249400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1249500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1249600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1249700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1249800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1249900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1250000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1250100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1250200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1250300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1250400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1250500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1250600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1250700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 1250800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1250900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1251000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1251100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1251200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1251300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1251400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1251500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1251600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1251700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1251800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1251900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1252000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1252100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1252200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1252300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1252400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1252500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1252600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1252700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1252800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1252900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1253000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1253100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1253200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1253300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1253400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1253500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1253600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1253700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1253800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1253900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1254000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1254100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1254200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1254300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1254400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 1254500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 1254600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1254700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1254800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1254900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1255000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1255100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 1255200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1255300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1255400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1255500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1255600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1255700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1255800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1255900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1256000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1256100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1256200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 1256300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1256400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1256500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1256600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1256700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1256800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 1256900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1257000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1257100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1257200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 1257300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1257400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1257500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1257600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1257700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 1257800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1257900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1258000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1258100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 1258200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1258300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1258400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1258500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 1258600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1258700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1258800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 1258900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1259000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1259100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1259200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1259300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1259400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1259500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1259600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 1259700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1259800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1259900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1260000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0300445556640625, - "eval_runtime": 3604.2377, - "eval_samples_per_second": 312.056, - "eval_steps_per_second": 19.504, - "step": 1260000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1260100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1260200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 1260300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1260400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1260500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1260600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1260700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1260800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1260900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1261000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1261100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1261200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1261300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1261400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1261500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1261600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1261700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1261800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 1261900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1262000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1262100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1262200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1262300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1262400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 1262500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1262600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1262700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1262800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1262900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1263000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 1263100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 1263200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1263300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1263400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1263500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1263600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1263700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1263800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1263900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1264000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1264100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1264200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1264300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1264400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1264500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1264600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1264700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1264800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1264900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1265000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1265100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1265200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1265300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1265400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 1265500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1265600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1265700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1265800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 1265900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1266000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1266100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1266200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1266300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1266400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1266500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1266600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1266700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1266800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1266900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1267000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1267100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1267200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1267300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1267400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1267500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1267600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 1267700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1267800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1267900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1268000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1268100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1268200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1268300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1268400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1268500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1268600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1268700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1268800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 1268900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1269000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 1269100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1269200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1269300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1269400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 1269500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1269600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1269700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1269800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1269900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 1270000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1270100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 1270200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1270300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 1270400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 1270500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1270600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1270700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1270800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 1270900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1271000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1271100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1271200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1271300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1271400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1271500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 1271600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1271700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1271800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1271900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1272000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1272100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1272200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1272300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 1272400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1272500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 1272600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0382, - "step": 1272700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0374, - "step": 1272800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 1272900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 1273000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1273100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1273200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1273300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1273400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1273500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1273600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1273700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1273800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 1273900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1274000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1274100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1274200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1274300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 1274400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1274500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1274600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1274700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1274800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1274900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1275000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1275100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1275200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1275300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1275400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1275500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1275600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1275700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1275800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1275900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1276000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1276100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1276200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1276300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1276400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1276500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 1276600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1276700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1276800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 1276900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1277000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1277100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1277200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1277300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 1277400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1277500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1277600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 1277700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1277800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1277900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 1278000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1278100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1278200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1278300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1278400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 1278500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1278600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1278700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1278800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1278900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1279000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1279100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1279200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1279300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1279400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1279500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1279600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1279700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1279800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1279900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1280000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0305328369140625, - "eval_runtime": 4108.9417, - "eval_samples_per_second": 273.726, - "eval_steps_per_second": 17.108, - "step": 1280000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1280100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1280200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1280300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1280400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1280500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1280600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1280700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1280800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1280900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1281000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 1281100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1281200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1281300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1281400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1281500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1281600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1281700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1281800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1281900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1282000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1282100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1282200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 1282300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1282400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 1282500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1282600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1282700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1282800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1282900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1283000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1283100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1283200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1283300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1283400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1283500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1283600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1283700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1283800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 1283900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1284000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0402, - "step": 1284100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1284200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1284300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 1284400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1284500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 1284600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 1284700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1284800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 1284900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1285000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1285100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1285200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1285300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 1285400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1285500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1285600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1285700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1285800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1285900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1286000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1286100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1286200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1286300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1286400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1286500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1286600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1286700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 1286800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1286900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1287000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1287100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1287200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1287300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1287400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 1287500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1287600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1287700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1287800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1287900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1288000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1288100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1288200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1288300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1288400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1288500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1288600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 1288700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1288800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1288900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 1289000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1289100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1289200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1289300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1289400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1289500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1289600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1289700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1289800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1289900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1290000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1290100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1290200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1290300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 1290400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1290500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1290600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 1290700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1290800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1290900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1291000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1291100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 1291200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 1291300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1291400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1291500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1291600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1291700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1291800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1291900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1292000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1292100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 1292200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1292300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1292400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1292500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1292600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 1292700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1292800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1292900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1293000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1293100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1293200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 1293300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1293400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1293500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1293600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1293700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1293800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 1293900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1294000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1294100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1294200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1294300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1294400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1294500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 1294600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1294700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1294800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1294900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1295000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1295100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1295200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1295300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1295400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 1295500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1295600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1295700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1295800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1295900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1296000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1296100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1296200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1296300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1296400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1296500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1296600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1296700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1296800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1296900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1297000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1297100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1297200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1297300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 1297400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1297500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1297600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1297700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1297800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1297900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1298000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1298100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1298200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1298300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1298400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 1298500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1298600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1298700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1298800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1298900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1299000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1299100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1299200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 1299300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 1299400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1299500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1299600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1299700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1299800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1299900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1300000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0302276611328125, - "eval_runtime": 4462.333, - "eval_samples_per_second": 252.048, - "eval_steps_per_second": 15.753, - "step": 1300000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1300100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1300200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 1300300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1300400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1300500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1300600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1300700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1300800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1300900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1301000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1301100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1301200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1301300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1301400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1301500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1301600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1301700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1301800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 1301900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1302000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1302100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 1302200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1302300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1302400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1302500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1302600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1302700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1302800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1302900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1303000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1303100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1303200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1303300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1303400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1303500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1303600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 1303700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1303800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1303900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1304000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1304100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1304200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1304300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1304400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1304500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1304600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1304700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 1304800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1304900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1305000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1305100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1305200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1305300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1305400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1305500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1305600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1305700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1305800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1305900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1306000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1306100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1306200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1306300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1306400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1306500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1306600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1306700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1306800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1306900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1307000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1307100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1307200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1307300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1307400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0394, - "step": 1307500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1307600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1307700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1307800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1307900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1308000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1308100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1308200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1308300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1308400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1308500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1308600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1308700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1308800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1308900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1309000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1309100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1309200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1309300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1309400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1309500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1309600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1309700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1309800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1309900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1310000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1310100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1310200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1310300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1310400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1310500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1310600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 1310700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1310800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1310900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1311000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1311100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1311200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1311300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1311400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1311500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1311600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1311700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1311800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1311900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1312000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1312100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1312200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1312300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1312400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1312500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1312600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1312700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1312800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1312900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1313000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 1313100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1313200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1313300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1313400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1313500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1313600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1313700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1313800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1313900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 1314000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1314100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1314200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1314300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1314400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1314500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 1314600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1314700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1314800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1314900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1315000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 1315100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1315200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 1315300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1315400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1315500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1315600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1315700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1315800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1315900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1316000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1316100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1316200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 1316300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1316400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1316500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1316600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1316700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1316800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1316900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1317000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1317100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1317200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1317300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1317400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 1317500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 1317600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1317700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1317800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1317900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1318000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1318100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1318200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1318300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1318400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1318500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1318600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1318700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1318800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1318900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1319000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1319100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1319200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1319300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1319400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1319500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1319600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1319700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1319800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1319900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1320000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0299835205078125, - "eval_runtime": 3928.7483, - "eval_samples_per_second": 286.28, - "eval_steps_per_second": 17.893, - "step": 1320000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1320100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1320200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1320300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1320400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1320500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1320600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1320700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1320800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1320900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1321000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1321100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1321200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1321300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1321400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 1321500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1321600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1321700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1321800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1321900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1322000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1322100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1322200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1322300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1322400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1322500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1322600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1322700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1322800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1322900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1323000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1323100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1323200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1323300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1323400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 1323500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1323600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1323700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1323800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1323900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1324000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1324100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1324200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1324300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1324400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1324500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1324600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1324700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 1324800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1324900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1325000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1325100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1325200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1325300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1325400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 1325500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1325600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1325700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1325800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1325900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1326000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1326100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1326200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1326300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1326400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1326500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1326600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 1326700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 1326800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1326900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1327000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 1327100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1327200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1327300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1327400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1327500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1327600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1327700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 1327800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1327900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1328000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1328100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1328200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1328300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1328400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1328500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 1328600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1328700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1328800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1328900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1329000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1329100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1329200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1329300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1329400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1329500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1329600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1329700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1329800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1329900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1330000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 1330100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1330200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1330300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 1330400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1330500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1330600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1330700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1330800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1330900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1331000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 1331100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1331200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1331300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1331400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1331500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1331600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1331700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1331800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1331900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1332000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1332100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1332200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1332300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1332400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1332500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1332600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1332700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1332800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1332900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1333000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1333100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1333200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1333300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1333400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 1333500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1333600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1333700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1333800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1333900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 1334000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1334100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1334200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1334300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1334400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1334500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1334600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1334700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1334800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1334900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1335000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1335100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1335200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1335300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1335400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1335500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1335600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1335700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1335800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1335900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 1336000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1336100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1336200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1336300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1336400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1336500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1336600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1336700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1336800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1336900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1337000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1337100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1337200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1337300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 1337400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1337500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1337600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1337700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 1337800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1337900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1338000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1338100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1338200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1338300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1338400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 1338500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1338600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1338700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1338800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1338900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 1339000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1339100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 1339200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 1339300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 1339400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1339500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1339600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1339700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1339800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1339900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1340000 - }, - { - "epoch": 0.0, - "eval_loss": 0.030059814453125, - "eval_runtime": 3710.6583, - "eval_samples_per_second": 303.106, - "eval_steps_per_second": 18.944, - "step": 1340000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1340100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1340200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1340300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1340400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1340500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1340600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1340700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1340800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1340900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1341000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1341100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1341200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1341300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 1341400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1341500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1341600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1341700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1341800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1341900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1342000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1342100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1342200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1342300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1342400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1342500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1342600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1342700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1342800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1342900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1343000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1343100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1343200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1343300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1343400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1343500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1343600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 1343700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1343800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1343900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1344000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1344100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1344200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1344300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1344400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1344500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 1344600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1344700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1344800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1344900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1345000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1345100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 1345200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 1345300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1345400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1345500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1345600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1345700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1345800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1345900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1346000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1346100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 1346200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1346300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1346400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1346500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1346600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1346700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1346800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1346900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1347000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1347100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1347200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1347300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1347400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1347500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1347600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1347700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1347800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1347900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1348000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1348100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1348200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1348300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1348400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1348500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1348600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1348700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1348800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 1348900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1349000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1349100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1349200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1349300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1349400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1349500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1349600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 1349700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1349800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1349900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1350000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1350100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1350200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 1350300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1350400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1350500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1350600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1350700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1350800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1350900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1351000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1351100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1351200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1351300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1351400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1351500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1351600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 1351700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1351800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1351900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1352000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1352100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1352200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1352300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1352400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1352500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1352600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1352700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1352800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1352900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1353000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 1353100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1353200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1353300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1353400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1353500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1353600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1353700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1353800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1353900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 1354000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1354100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1354200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0399, - "step": 1354300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 1354400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0377, - "step": 1354500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1354600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1354700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1354800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1354900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1355000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 1355100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1355200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1355300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1355400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1355500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1355600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1355700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1355800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1355900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1356000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 1356100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1356200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1356300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1356400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1356500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1356600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1356700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1356800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1356900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1357000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1357100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1357200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1357300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1357400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1357500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1357600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1357700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 1357800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1357900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1358000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1358100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 1358200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1358300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1358400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1358500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1358600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1358700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1358800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1358900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1359000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1359100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1359200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 1359300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1359400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1359500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1359600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1359700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1359800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 1359900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1360000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0297088623046875, - "eval_runtime": 4153.2895, - "eval_samples_per_second": 270.803, - "eval_steps_per_second": 16.925, - "step": 1360000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1360100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1360200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1360300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1360400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1360500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1360600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1360700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1360800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1360900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1361000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1361100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1361200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1361300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1361400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1361500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1361600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1361700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1361800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1361900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1362000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1362100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1362200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1362300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0362, - "step": 1362400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1362500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1362600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1362700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1362800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1362900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1363000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1363100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1363200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 1363300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1363400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1363500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1363600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1363700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1363800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1363900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1364000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1364100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1364200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1364300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1364400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1364500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1364600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1364700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1364800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1364900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 1365000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1365100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1365200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1365300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1365400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1365500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1365600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1365700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1365800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1365900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1366000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1366100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1366200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1366300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1366400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1366500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1366600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1366700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1366800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1366900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1367000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1367100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1367200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1367300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 1367400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1367500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1367600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1367700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1367800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1367900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1368000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1368100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1368200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1368300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1368400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1368500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1368600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1368700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1368800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1368900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1369000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1369100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1369200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1369300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1369400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1369500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1369600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1369700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1369800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1369900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1370000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1370100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1370200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1370300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1370400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1370500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1370600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1370700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1370800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1370900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1371000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1371100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1371200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1371300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1371400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1371500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 1371600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1371700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1371800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1371900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1372000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1372100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0385, - "step": 1372200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 1372300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 1372400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1372500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0388, - "step": 1372600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1372700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1372800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 1372900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 1373000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1373100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1373200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1373300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1373400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1373500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1373600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1373700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1373800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1373900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1374000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1374100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1374200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1374300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1374400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1374500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1374600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1374700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 1374800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1374900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1375000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1375100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1375200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1375300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1375400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1375500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1375600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1375700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1375800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1375900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1376000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0363, - "step": 1376100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1376200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1376300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1376400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1376500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1376600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1376700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1376800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1376900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 1377000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1377100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1377200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1377300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 1377400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1377500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1377600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1377700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1377800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1377900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1378000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1378100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1378200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 1378300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 1378400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 1378500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1378600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1378700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1378800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1378900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1379000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1379100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1379200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1379300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1379400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1379500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1379600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1379700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 1379800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1379900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 1380000 - }, - { - "epoch": 0.0, - "eval_loss": 0.030059814453125, - "eval_runtime": 4082.9937, - "eval_samples_per_second": 275.465, - "eval_steps_per_second": 17.217, - "step": 1380000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 1380100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1380200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1380300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1380400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1380500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 1380600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 1380700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1380800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1380900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1381000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1381100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1381200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1381300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0366, - "step": 1381400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1381500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1381600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1381700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1381800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1381900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1382000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1382100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1382200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1382300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1382400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1382500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1382600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1382700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1382800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1382900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1383000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1383100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1383200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1383300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 1383400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1383500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1383600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1383700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 1383800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1383900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1384000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1384100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1384200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1384300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1384400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 1384500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1384600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1384700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1384800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1384900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1385000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1385100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1385200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1385300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1385400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 1385500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1385600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1385700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 1385800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1385900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1386000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 1386100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1386200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1386300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1386400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1386500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1386600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1386700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1386800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1386900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1387000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1387100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1387200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1387300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1387400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1387500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1387600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1387700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1387800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1387900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1388000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1388100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1388200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1388300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1388400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1388500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1388600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1388700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1388800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1388900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1389000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1389100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1389200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1389300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1389400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1389500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1389600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1389700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1389800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1389900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1390000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1390100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1390200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1390300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1390400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1390500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1390600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1390700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 1390800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 1390900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1391000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1391100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1391200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1391300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1391400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1391500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1391600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1391700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 1391800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1391900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1392000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 1392100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1392200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1392300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 1392400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1392500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1392600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1392700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1392800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1392900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1393000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1393100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1393200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 1393300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1393400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1393500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1393600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 1393700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1393800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1393900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1394000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1394100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1394200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1394300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1394400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1394500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1394600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1394700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1394800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 1394900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1395000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1395100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1395200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1395300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1395400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1395500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1395600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1395700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1395800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1395900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1396000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1396100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1396200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1396300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1396400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 1396500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1396600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1396700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1396800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1396900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1397000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1397100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1397200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1397300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1397400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 1397500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1397600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1397700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1397800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 1397900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 1398000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1398100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 1398200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1398300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1398400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 1398500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1398600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1398700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1398800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1398900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1399000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1399100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1399200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1399300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1399400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1399500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1399600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1399700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1399800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1399900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1400000 - }, - { - "epoch": 0.0, - "eval_loss": 0.029754638671875, - "eval_runtime": 3458.2901, - "eval_samples_per_second": 325.225, - "eval_steps_per_second": 20.327, - "step": 1400000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1400100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1400200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1400300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1400400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1400500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1400600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1400700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1400800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1400900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1401000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1401100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 1401200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1401300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1401400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1401500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1401600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1401700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 1401800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1401900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 1402000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1402100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1402200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1402300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1402400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1402500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1402600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1402700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1402800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1402900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1403000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1403100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1403200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1403300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1403400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1403500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1403600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1403700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1403800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1403900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1404000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 1404100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1404200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1404300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1404400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1404500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1404600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1404700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1404800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1404900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1405000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1405100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1405200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1405300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1405400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1405500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1405600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1405700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 1405800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1405900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1406000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1406100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1406200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1406300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1406400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1406500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1406600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1406700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1406800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1406900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1407000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1407100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1407200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1407300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1407400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1407500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1407600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1407700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1407800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1407900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1408000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1408100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1408200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1408300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1408400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1408500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1408600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1408700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1408800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1408900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1409000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1409100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1409200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1409300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1409400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1409500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1409600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1409700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1409800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1409900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1410000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1410100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1410200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1410300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1410400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1410500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1410600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1410700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1410800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 1410900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1411000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 1411100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1411200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1411300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1411400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1411500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1411600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1411700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1411800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1411900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1412000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1412100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1412200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1412300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1412400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1412500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 1412600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1412700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1412800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1412900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1413000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1413100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1413200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1413300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1413400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 1413500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1413600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1413700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1413800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1413900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1414000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1414100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 1414200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1414300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1414400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1414500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 1414600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1414700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1414800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1414900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1415000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1415100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1415200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1415300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1415400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1415500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1415600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 1415700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1415800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1415900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1416000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1416100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1416200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1416300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 1416400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1416500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1416600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1416700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1416800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1416900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1417000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1417100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1417200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1417300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1417400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1417500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1417600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1417700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1417800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1417900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1418000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1418100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1418200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1418300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1418400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1418500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1418600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 1418700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 1418800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1418900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1419000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1419100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1419200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1419300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1419400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1419500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1419600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1419700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1419800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1419900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1420000 - }, - { - "epoch": 0.0, - "eval_loss": 0.02960205078125, - "eval_runtime": 3232.2323, - "eval_samples_per_second": 347.971, - "eval_steps_per_second": 21.748, - "step": 1420000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1420100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1420200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1420300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1420400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1420500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1420600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1420700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1420800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1420900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1421000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1421100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1421200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1421300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1421400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1421500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1421600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1421700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1421800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1421900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1422000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1422100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1422200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1422300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1422400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1422500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1422600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1422700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1422800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1422900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1423000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1423100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1423200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1423300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1423400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1423500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1423600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1423700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1423800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1423900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1424000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1424100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1424200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1424300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1424400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1424500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1424600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1424700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1424800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1424900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1425000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1425100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1425200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1425300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1425400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1425500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1425600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1425700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1425800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1425900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1426000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1426100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1426200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1426300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1426400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 1426500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1426600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1426700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1426800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1426900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1427000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1427100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1427200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1427300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1427400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1427500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1427600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1427700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 1427800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1427900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1428000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1428100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1428200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1428300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1428400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1428500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1428600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1428700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1428800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1428900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1429000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1429100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 1429200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1429300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1429400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1429500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1429600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1429700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1429800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1429900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1430000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 1430100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1430200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 1430300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 1430400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1430500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1430600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1430700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1430800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1430900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1431000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1431100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1431200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1431300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 1431400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1431500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1431600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1431700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 1431800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1431900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1432000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1432100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1432200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1432300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1432400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1432500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1432600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1432700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1432800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1432900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1433000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1433100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1433200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1433300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1433400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 1433500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1433600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1433700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1433800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 1433900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1434000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1434100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1434200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1434300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1434400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1434500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1434600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1434700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1434800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1434900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1435000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1435100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1435200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1435300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1435400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1435500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1435600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1435700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1435800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1435900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1436000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1436100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1436200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1436300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1436400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1436500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1436600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1436700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1436800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1436900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1437000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1437100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 1437200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1437300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1437400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1437500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1437600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1437700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1437800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1437900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 1438000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1438100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1438200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1438300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1438400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1438500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1438600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1438700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1438800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1438900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 1439000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1439100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1439200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1439300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1439400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1439500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1439600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1439700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1439800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1439900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1440000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0299072265625, - "eval_runtime": 3252.8152, - "eval_samples_per_second": 345.769, - "eval_steps_per_second": 21.611, - "step": 1440000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1440100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1440200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1440300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1440400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1440500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1440600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1440700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1440800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1440900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1441000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1441100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1441200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1441300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1441400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1441500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1441600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1441700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1441800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1441900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 1442000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1442100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1442200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1442300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1442400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1442500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 1442600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1442700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1442800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1442900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1443000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1443100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1443200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1443300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1443400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1443500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1443600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1443700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1443800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1443900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1444000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1444100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1444200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1444300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1444400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1444500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1444600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1444700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1444800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1444900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 1445000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1445100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1445200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1445300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1445400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1445500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1445600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1445700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1445800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1445900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 1446000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1446100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1446200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 1446300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1446400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1446500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1446600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 1446700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 1446800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1446900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1447000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1447100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1447200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1447300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1447400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1447500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1447600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1447700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1447800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1447900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1448000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1448100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1448200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1448300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1448400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 1448500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1448600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1448700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1448800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1448900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1449000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1449100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1449200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1449300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1449400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1449500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1449600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1449700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1449800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1449900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1450000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 1450100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1450200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1450300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1450400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1450500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1450600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1450700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1450800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1450900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1451000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1451100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1451200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1451300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1451400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1451500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1451600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 1451700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 1451800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1451900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1452000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1452100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 1452200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1452300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1452400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1452500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1452600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1452700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1452800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1452900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1453000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1453100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1453200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1453300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1453400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1453500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1453600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1453700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1453800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1453900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1454000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1454100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1454200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1454300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1454400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1454500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1454600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1454700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1454800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1454900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1455000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1455100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1455200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1455300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1455400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 1455500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 1455600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1455700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1455800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1455900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1456000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1456100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1456200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 1456300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1456400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1456500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1456600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1456700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1456800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1456900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1457000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1457100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1457200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1457300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1457400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1457500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1457600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1457700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1457800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1457900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 1458000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1458100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1458200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1458300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 1458400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1458500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1458600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1458700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1458800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1458900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1459000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1459100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1459200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1459300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1459400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1459500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1459600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1459700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1459800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1459900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1460000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0298309326171875, - "eval_runtime": 3154.1525, - "eval_samples_per_second": 356.585, - "eval_steps_per_second": 22.287, - "step": 1460000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1460100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1460200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1460300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1460400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 1460500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 1460600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1460700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1460800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1460900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1461000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1461100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 1461200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1461300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1461400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1461500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1461600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1461700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1461800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1461900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1462000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1462100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1462200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1462300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1462400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1462500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 1462600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1462700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1462800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1462900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1463000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1463100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1463200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1463300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1463400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1463500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1463600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1463700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1463800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 1463900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1464000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1464100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1464200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1464300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1464400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 1464500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1464600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1464700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1464800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1464900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1465000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1465100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1465200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 1465300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1465400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1465500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0418, - "step": 1465600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1465700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 1465800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1465900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1466000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1466100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1466200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1466300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1466400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1466500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1466600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1466700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1466800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1466900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1467000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 1467100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1467200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1467300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1467400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1467500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1467600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1467700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1467800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1467900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1468000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1468100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1468200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1468300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 1468400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1468500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1468600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1468700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1468800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1468900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1469000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1469100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1469200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1469300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1469400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1469500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1469600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1469700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1469800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1469900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1470000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1470100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1470200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1470300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1470400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1470500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1470600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1470700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1470800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1470900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1471000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1471100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1471200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1471300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1471400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1471500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1471600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1471700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1471800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1471900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1472000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1472100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1472200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1472300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1472400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1472500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1472600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1472700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1472800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 1472900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1473000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1473100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1473200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1473300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1473400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1473500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1473600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1473700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1473800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1473900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1474000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1474100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1474200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1474300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1474400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 1474500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 1474600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1474700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1474800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 1474900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1475000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1475100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1475200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1475300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1475400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1475500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1475600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1475700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 1475800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1475900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1476000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1476100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 1476200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1476300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1476400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 1476500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1476600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1476700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1476800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1476900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1477000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1477100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1477200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1477300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1477400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1477500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1477600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1477700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1477800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1477900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1478000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 1478100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1478200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1478300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1478400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1478500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1478600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1478700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1478800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1478900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1479000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1479100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1479200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1479300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1479400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1479500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1479600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1479700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 1479800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1479900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1480000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0298309326171875, - "eval_runtime": 3118.3877, - "eval_samples_per_second": 360.675, - "eval_steps_per_second": 22.542, - "step": 1480000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1480100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1480200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1480300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1480400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1480500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0295, - "step": 1480600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1480700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1480800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1480900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1481000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1481100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1481200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1481300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1481400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1481500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1481600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1481700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1481800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 1481900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1482000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1482100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1482200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1482300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1482400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1482500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1482600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1482700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1482800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1482900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1483000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1483100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1483200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1483300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1483400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1483500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1483600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1483700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1483800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1483900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1484000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1484100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1484200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1484300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1484400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1484500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1484600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 1484700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1484800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1484900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1485000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1485100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1485200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1485300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1485400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1485500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1485600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1485700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1485800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1485900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1486000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1486100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1486200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1486300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1486400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1486500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 1486600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1486700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1486800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1486900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1487000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1487100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1487200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1487300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1487400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1487500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1487600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1487700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1487800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1487900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1488000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1488100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 1488200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 1488300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1488400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1488500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1488600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1488700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1488800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1488900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1489000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1489100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1489200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1489300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1489400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1489500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1489600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1489700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1489800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1489900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1490000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1490100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1490200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1490300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1490400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1490500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1490600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1490700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1490800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1490900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1491000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1491100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1491200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1491300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1491400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1491500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1491600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1491700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1491800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1491900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 1492000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1492100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1492200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1492300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1492400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1492500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1492600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1492700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 1492800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1492900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1493000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1493100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1493200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1493300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1493400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1493500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1493600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1493700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1493800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1493900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1494000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1494100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1494200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1494300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 1494400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1494500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 1494600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1494700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1494800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1494900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1495000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1495100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1495200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1495300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 1495400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1495500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1495600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 1495700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 1495800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 1495900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1496000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1496100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1496200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 1496300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1496400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1496500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1496600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1496700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1496800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1496900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1497000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1497100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1497200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1497300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1497400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1497500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1497600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1497700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1497800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1497900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1498000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1498100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1498200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1498300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1498400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 1498500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1498600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1498700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1498800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1498900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1499000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1499100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1499200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1499300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1499400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1499500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1499600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1499700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1499800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1499900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1500000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0298919677734375, - "eval_runtime": 3292.6728, - "eval_samples_per_second": 341.584, - "eval_steps_per_second": 21.349, - "step": 1500000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1500100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 1500200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1500300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1500400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1500500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1500600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1500700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1500800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1500900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 1501000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1501100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1501200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 1501300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1501400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1501500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1501600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 1501700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1501800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1501900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1502000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1502100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1502200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1502300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1502400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 1502500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1502600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1502700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1502800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1502900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1503000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1503100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 1503200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1503300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1503400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1503500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1503600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1503700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1503800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1503900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1504000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1504100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1504200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1504300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1504400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1504500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1504600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1504700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1504800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1504900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1505000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1505100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1505200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1505300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1505400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1505500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1505600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1505700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1505800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1505900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1506000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1506100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1506200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1506300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1506400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 1506500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1506600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1506700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 1506800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1506900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1507000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1507100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1507200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1507300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1507400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 1507500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1507600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1507700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1507800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1507900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1508000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1508100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1508200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1508300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1508400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1508500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1508600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1508700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1508800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1508900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1509000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1509100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1509200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1509300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1509400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1509500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1509600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1509700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1509800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1509900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 1510000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1510100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1510200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1510300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 1510400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1510500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1510600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1510700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1510800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1510900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 1511000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1511100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1511200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1511300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1511400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1511500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1511600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1511700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1511800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1511900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1512000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1512100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1512200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1512300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1512400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1512500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1512600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1512700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1512800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1512900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1513000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1513100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1513200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1513300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1513400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1513500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1513600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 1513700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1513800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1513900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 1514000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1514100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1514200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1514300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1514400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1514500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1514600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1514700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1514800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1514900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1515000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1515100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1515200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1515300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1515400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1515500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1515600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1515700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1515800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 1515900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1516000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1516100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1516200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1516300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1516400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1516500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1516600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 1516700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1516800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1516900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1517000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1517100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1517200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1517300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1517400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1517500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1517600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1517700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1517800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1517900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1518000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1518100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1518200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1518300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1518400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1518500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1518600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1518700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1518800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1518900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1519000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1519100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1519200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1519300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1519400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1519500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1519600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 1519700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1519800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1519900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1520000 - }, - { - "epoch": 0.0, - "eval_loss": 0.029541015625, - "eval_runtime": 3474.5692, - "eval_samples_per_second": 323.701, - "eval_steps_per_second": 20.232, - "step": 1520000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1520100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1520200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1520300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1520400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1520500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1520600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1520700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1520800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 1520900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1521000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 1521100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1521200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1521300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1521400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1521500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1521600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1521700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1521800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1521900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1522000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1522100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1522200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1522300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1522400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1522500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1522600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1522700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1522800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1522900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1523000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1523100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1523200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1523300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1523400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1523500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1523600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1523700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1523800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 1523900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1524000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1524100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1524200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1524300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1524400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1524500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1524600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1524700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 1524800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1524900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1525000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1525100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1525200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1525300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1525400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1525500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1525600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1525700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1525800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1525900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1526000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1526100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1526200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1526300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1526400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1526500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1526600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1526700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1526800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1526900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1527000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 1527100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1527200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1527300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1527400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1527500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1527600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1527700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1527800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1527900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1528000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1528100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1528200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1528300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1528400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1528500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1528600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1528700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 1528800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 1528900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1529000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1529100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1529200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1529300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1529400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1529500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 1529600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1529700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1529800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1529900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1530000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 1530100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1530200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1530300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1530400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1530500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1530600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1530700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1530800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 1530900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1531000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1531100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1531200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1531300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 1531400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1531500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1531600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 1531700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1531800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1531900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 1532000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1532100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1532200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1532300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1532400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1532500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1532600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1532700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1532800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1532900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1533000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1533100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1533200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1533300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1533400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 1533500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1533600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 1533700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1533800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1533900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1534000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1534100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1534200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1534300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1534400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1534500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 1534600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1534700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1534800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1534900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1535000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1535100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1535200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 1535300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1535400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1535500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1535600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1535700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1535800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 1535900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1536000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1536100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1536200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1536300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1536400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 1536500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1536600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1536700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1536800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1536900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1537000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1537100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1537200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1537300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1537400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1537500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1537600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1537700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1537800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1537900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1538000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 1538100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1538200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1538300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1538400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1538500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1538600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1538700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1538800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1538900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1539000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1539100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1539200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1539300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1539400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1539500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1539600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1539700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1539800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1539900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1540000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0293731689453125, - "eval_runtime": 3714.1825, - "eval_samples_per_second": 302.818, - "eval_steps_per_second": 18.926, - "step": 1540000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1540100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1540200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1540300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1540400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1540500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1540600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1540700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1540800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1540900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1541000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1541100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1541200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1541300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1541400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1541500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1541600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1541700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1541800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1541900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1542000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1542100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1542200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1542300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1542400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1542500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 1542600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1542700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1542800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1542900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1543000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1543100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1543200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1543300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1543400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1543500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 1543600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1543700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1543800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1543900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1544000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1544100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1544200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1544300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1544400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1544500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1544600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 1544700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1544800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1544900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1545000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1545100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1545200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1545300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1545400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1545500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1545600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1545700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1545800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1545900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1546000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1546100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1546200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1546300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1546400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 1546500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 1546600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1546700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1546800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1546900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1547000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1547100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1547200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1547300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1547400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1547500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1547600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1547700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1547800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1547900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1548000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1548100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 1548200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1548300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1548400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1548500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 1548600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1548700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1548800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1548900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1549000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 1549100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1549200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1549300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1549400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1549500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1549600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1549700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1549800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1549900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1550000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1550100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1550200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1550300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1550400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1550500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1550600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1550700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1550800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1550900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1551000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1551100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 1551200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1551300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1551400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1551500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1551600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1551700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1551800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1551900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1552000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1552100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1552200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 1552300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1552400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1552500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1552600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1552700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1552800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1552900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1553000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1553100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1553200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1553300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1553400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 1553500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1553600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1553700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1553800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1553900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1554000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1554100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0294, - "step": 1554200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1554300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1554400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1554500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1554600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1554700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1554800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1554900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 1555000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1555100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1555200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1555300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1555400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1555500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1555600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1555700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1555800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1555900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1556000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1556100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1556200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 1556300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1556400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1556500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 1556600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1556700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1556800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1556900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1557000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1557100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1557200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 1557300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1557400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1557500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1557600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1557700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1557800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1557900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1558000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1558100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1558200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1558300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1558400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1558500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1558600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 1558700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1558800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1558900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1559000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1559100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1559200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1559300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1559400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1559500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1559600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1559700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1559800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1559900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1560000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0294952392578125, - "eval_runtime": 4014.0505, - "eval_samples_per_second": 280.197, - "eval_steps_per_second": 17.512, - "step": 1560000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1560100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1560200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1560300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1560400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1560500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1560600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1560700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1560800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1560900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1561000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1561100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1561200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1561300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1561400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1561500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1561600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1561700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1561800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1561900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1562000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1562100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1562200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1562300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1562400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1562500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1562600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1562700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1562800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1562900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 1563000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1563100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1563200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1563300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1563400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1563500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1563600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1563700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1563800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1563900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1564000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1564100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1564200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1564300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 1564400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1564500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1564600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1564700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 1564800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1564900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1565000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1565100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1565200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 1565300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1565400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1565500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1565600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1565700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1565800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1565900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 1566000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1566100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1566200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 1566300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0364, - "step": 1566400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 1566500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1566600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1566700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1566800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1566900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1567000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1567100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1567200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1567300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1567400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 1567500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1567600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1567700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1567800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1567900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1568000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1568100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1568200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1568300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 1568400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1568500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1568600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1568700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1568800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1568900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1569000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1569100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1569200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1569300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1569400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1569500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1569600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0363, - "step": 1569700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1569800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1569900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1570000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1570100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1570200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1570300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1570400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1570500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1570600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1570700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1570800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1570900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1571000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 1571100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1571200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1571300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1571400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1571500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1571600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1571700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1571800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1571900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 1572000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1572100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1572200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1572300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1572400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1572500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1572600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1572700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1572800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1572900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 1573000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1573100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1573200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1573300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1573400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1573500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1573600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1573700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1573800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1573900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1574000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1574100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1574200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1574300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1574400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 1574500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1574600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1574700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1574800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1574900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1575000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1575100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1575200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1575300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1575400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1575500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1575600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1575700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1575800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1575900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1576000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1576100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1576200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1576300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1576400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1576500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1576600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1576700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1576800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1576900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1577000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 1577100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1577200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1577300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1577400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1577500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 1577600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 1577700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1577800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1577900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1578000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1578100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1578200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1578300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1578400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1578500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1578600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1578700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 1578800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1578900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1579000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1579100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1579200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1579300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1579400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1579500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1579600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1579700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1579800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1579900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1580000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0294647216796875, - "eval_runtime": 4066.6786, - "eval_samples_per_second": 276.57, - "eval_steps_per_second": 17.286, - "step": 1580000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1580100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1580200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1580300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1580400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0294, - "step": 1580500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1580600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1580700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1580800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1580900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1581000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 1581100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1581200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1581300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1581400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1581500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1581600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1581700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1581800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1581900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1582000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1582100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 1582200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1582300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1582400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1582500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1582600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 1582700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1582800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1582900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1583000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1583100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1583200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1583300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1583400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1583500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 1583600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1583700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1583800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1583900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1584000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1584100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1584200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1584300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1584400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1584500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1584600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1584700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1584800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1584900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1585000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1585100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1585200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1585300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0296, - "step": 1585400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1585500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1585600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1585700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1585800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1585900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1586000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1586100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1586200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1586300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1586400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1586500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1586600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1586700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1586800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 1586900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1587000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1587100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 1587200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1587300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1587400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 1587500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1587600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1587700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1587800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1587900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1588000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 1588100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1588200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1588300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1588400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1588500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1588600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1588700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1588800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1588900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1589000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1589100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1589200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1589300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1589400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1589500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1589600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1589700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 1589800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1589900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1590000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1590100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1590200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1590300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1590400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1590500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1590600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1590700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 1590800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1590900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1591000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1591100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1591200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1591300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 1591400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 1591500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 1591600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1591700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1591800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1591900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1592000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1592100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1592200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1592300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1592400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1592500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1592600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1592700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1592800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 1592900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1593000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1593100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1593200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1593300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 1593400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1593500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1593600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1593700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1593800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1593900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1594000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1594100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1594200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1594300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1594400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1594500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1594600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1594700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1594800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1594900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1595000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1595100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1595200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1595300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1595400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1595500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1595600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1595700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1595800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1595900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1596000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 1596100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1596200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1596300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1596400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1596500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1596600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1596700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1596800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1596900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 1597000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1597100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1597200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1597300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1597400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 1597500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 1597600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1597700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1597800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1597900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1598000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1598100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1598200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1598300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1598400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1598500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1598600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1598700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1598800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1598900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1599000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1599100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1599200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1599300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1599400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1599500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1599600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1599700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1599800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1599900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1600000 - }, - { - "epoch": 0.0, - "eval_loss": 0.029632568359375, - "eval_runtime": 3871.4209, - "eval_samples_per_second": 290.519, - "eval_steps_per_second": 18.158, - "step": 1600000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1600100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 1600200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1600300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1600400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1600500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1600600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1600700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1600800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1600900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1601000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1601100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1601200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1601300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1601400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1601500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 1601600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 1601700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1601800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1601900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1602000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1602100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1602200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1602300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1602400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1602500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1602600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1602700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1602800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1602900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1603000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1603100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1603200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1603300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1603400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1603500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1603600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1603700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1603800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 1603900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1604000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1604100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1604200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1604300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1604400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1604500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1604600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 1604700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1604800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1604900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1605000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1605100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1605200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1605300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1605400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1605500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1605600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1605700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1605800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1605900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1606000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1606100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1606200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1606300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1606400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 1606500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1606600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0473, - "step": 1606700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1606800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0382, - "step": 1606900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 1607000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0396, - "step": 1607100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 1607200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1607300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1607400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1607500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1607600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1607700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1607800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1607900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1608000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1608100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1608200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1608300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1608400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1608500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1608600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1608700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1608800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1608900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1609000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 1609100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1609200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1609300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1609400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1609500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1609600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1609700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1609800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 1609900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1610000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1610100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1610200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1610300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1610400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1610500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1610600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1610700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1610800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1610900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1611000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1611100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1611200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1611300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1611400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1611500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1611600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1611700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1611800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1611900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1612000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1612100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1612200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 1612300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1612400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1612500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1612600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 1612700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1612800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1612900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1613000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 1613100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 1613200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0468, - "step": 1613300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1613400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1613500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1613600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1613700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1613800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1613900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1614000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1614100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1614200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1614300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1614400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1614500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1614600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1614700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1614800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1614900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 1615000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1615100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1615200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1615300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1615400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1615500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1615600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1615700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1615800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1615900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1616000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1616100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1616200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 1616300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1616400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1616500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1616600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1616700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1616800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1616900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1617000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1617100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1617200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1617300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1617400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1617500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1617600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1617700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1617800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1617900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1618000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1618100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1618200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1618300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1618400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1618500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1618600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1618700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 1618800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1618900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1619000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1619100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1619200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1619300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1619400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1619500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 1619600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1619700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1619800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1619900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1620000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0294647216796875, - "eval_runtime": 4096.2178, - "eval_samples_per_second": 274.576, - "eval_steps_per_second": 17.161, - "step": 1620000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1620100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1620200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1620300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1620400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 1620500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1620600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1620700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1620800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1620900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1621000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1621100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1621200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1621300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1621400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 1621500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1621600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1621700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1621800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1621900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1622000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1622100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1622200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1622300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1622400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1622500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1622600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1622700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1622800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 1622900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1623000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1623100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1623200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1623300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1623400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1623500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1623600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1623700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1623800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1623900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1624000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1624100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1624200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1624300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1624400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1624500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1624600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1624700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1624800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1624900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1625000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1625100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1625200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 1625300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1625400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1625500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1625600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1625700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1625800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1625900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1626000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1626100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1626200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1626300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1626400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1626500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1626600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1626700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1626800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1626900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1627000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1627100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1627200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 1627300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1627400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1627500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 1627600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1627700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1627800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1627900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1628000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1628100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1628200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1628300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1628400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1628500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1628600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1628700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1628800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1628900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1629000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1629100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1629200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1629300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1629400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1629500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1629600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1629700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1629800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1629900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 1630000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1630100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1630200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1630300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1630400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1630500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1630600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1630700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1630800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1630900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1631000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1631100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1631200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1631300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1631400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 1631500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1631600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1631700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1631800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1631900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1632000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1632100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1632200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1632300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1632400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1632500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1632600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1632700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1632800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1632900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 1633000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1633100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1633200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 1633300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1633400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1633500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1633600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 1633700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1633800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1633900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1634000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1634100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1634200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1634300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1634400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 1634500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1634600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1634700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1634800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1634900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1635000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1635100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1635200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1635300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1635400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1635500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1635600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 1635700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1635800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1635900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1636000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1636100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 1636200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1636300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1636400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1636500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1636600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1636700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1636800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1636900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1637000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1637100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1637200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 1637300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1637400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 1637500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1637600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1637700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1637800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1637900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1638000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1638100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1638200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1638300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1638400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1638500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1638600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1638700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1638800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1638900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1639000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1639100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1639200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1639300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1639400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1639500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1639600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1639700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1639800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1639900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1640000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0295257568359375, - "eval_runtime": 4142.3589, - "eval_samples_per_second": 271.518, - "eval_steps_per_second": 16.97, - "step": 1640000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1640100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1640200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1640300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1640400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1640500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1640600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1640700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1640800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1640900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1641000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1641100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1641200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1641300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1641400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1641500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1641600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1641700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 1641800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1641900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1642000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1642100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 1642200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1642300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1642400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1642500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1642600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1642700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1642800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1642900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1643000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1643100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1643200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 1643300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1643400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1643500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1643600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1643700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1643800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1643900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1644000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1644100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 1644200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1644300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1644400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1644500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 1644600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1644700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1644800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1644900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1645000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1645100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1645200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1645300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1645400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1645500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1645600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1645700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1645800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1645900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1646000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1646100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 1646200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1646300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1646400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1646500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1646600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1646700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 1646800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1646900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1647000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1647100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 1647200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1647300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1647400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 1647500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1647600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1647700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1647800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1647900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1648000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1648100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1648200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1648300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1648400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1648500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1648600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1648700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1648800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1648900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1649000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1649100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1649200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1649300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1649400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1649500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1649600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1649700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1649800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1649900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1650000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1650100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1650200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1650300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1650400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1650500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1650600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1650700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1650800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1650900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1651000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1651100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1651200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1651300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 1651400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1651500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1651600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1651700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1651800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 1651900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1652000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1652100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1652200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1652300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1652400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1652500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1652600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1652700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1652800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1652900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1653000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1653100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1653200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1653300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1653400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1653500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1653600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1653700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1653800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1653900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1654000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1654100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 1654200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1654300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1654400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1654500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1654600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1654700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1654800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1654900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1655000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1655100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1655200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1655300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1655400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1655500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1655600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1655700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1655800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1655900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1656000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1656100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1656200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1656300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1656400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1656500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1656600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 1656700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1656800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0411, - "step": 1656900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1657000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1657100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1657200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1657300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1657400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1657500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1657600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1657700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1657800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1657900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1658000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1658100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1658200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1658300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1658400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1658500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1658600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 1658700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1658800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1658900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1659000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 1659100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1659200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 1659300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1659400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1659500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1659600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1659700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1659800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1659900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1660000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0294342041015625, - "eval_runtime": 3159.0731, - "eval_samples_per_second": 356.029, - "eval_steps_per_second": 22.252, - "step": 1660000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1660100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1660200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1660300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 1660400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1660500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1660600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1660700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1660800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1660900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1661000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1661100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1661200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1661300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1661400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 1661500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1661600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1661700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1661800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1661900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1662000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1662100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 1662200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1662300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1662400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1662500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 1662600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1662700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1662800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1662900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1663000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1663100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1663200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1663300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1663400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1663500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1663600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1663700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1663800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1663900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1664000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1664100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1664200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1664300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1664400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 1664500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1664600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1664700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1664800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1664900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 1665000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1665100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1665200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1665300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1665400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1665500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1665600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1665700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 1665800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1665900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 1666000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0538, - "step": 1666100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 1666200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1666300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1666400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1666500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1666600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 1666700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1666800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1666900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1667000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1667100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1667200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1667300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1667400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1667500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1667600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1667700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1667800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1667900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1668000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1668100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 1668200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1668300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1668400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1668500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1668600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1668700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1668800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1668900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1669000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1669100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1669200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1669300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1669400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1669500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1669600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1669700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1669800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1669900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1670000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 1670100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1670200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1670300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1670400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1670500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 1670600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 1670700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1670800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1670900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1671000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1671100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1671200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1671300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1671400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1671500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1671600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1671700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1671800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1671900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1672000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1672100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1672200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1672300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1672400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 1672500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1672600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1672700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1672800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1672900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1673000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1673100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1673200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1673300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1673400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1673500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 1673600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1673700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1673800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1673900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1674000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1674100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1674200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1674300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1674400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1674500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1674600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1674700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1674800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1674900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1675000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1675100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1675200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1675300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1675400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0367, - "step": 1675500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 1675600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1675700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1675800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1675900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1676000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1676100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1676200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1676300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1676400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1676500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1676600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1676700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1676800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1676900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1677000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1677100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1677200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1677300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1677400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 1677500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 1677600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1677700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1677800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1677900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1678000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1678100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1678200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1678300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1678400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1678500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1678600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1678700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1678800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1678900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1679000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1679100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1679200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1679300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1679400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1679500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1679600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1679700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1679800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1679900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1680000 - }, - { - "epoch": 0.0, - "eval_loss": 0.029327392578125, - "eval_runtime": 3073.5891, - "eval_samples_per_second": 365.931, - "eval_steps_per_second": 22.871, - "step": 1680000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1680100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1680200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1680300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1680400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1680500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1680600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1680700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1680800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1680900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1681000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1681100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 1681200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 1681300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1681400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1681500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1681600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1681700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1681800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1681900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1682000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1682100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1682200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1682300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1682400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1682500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1682600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1682700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1682800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1682900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1683000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1683100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1683200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1683300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1683400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1683500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1683600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 1683700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1683800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1683900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1684000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1684100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1684200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1684300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1684400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1684500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1684600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1684700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1684800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1684900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1685000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1685100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1685200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1685300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1685400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1685500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1685600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1685700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1685800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1685900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1686000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1686100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 1686200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1686300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1686400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1686500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 1686600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1686700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1686800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1686900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1687000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1687100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 1687200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 1687300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1687400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1687500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1687600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1687700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1687800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 1687900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1688000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1688100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1688200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1688300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1688400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1688500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1688600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1688700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1688800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1688900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1689000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1689100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1689200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 1689300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1689400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1689500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1689600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 1689700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1689800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1689900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1690000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 1690100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1690200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1690300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1690400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1690500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 1690600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1690700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1690800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1690900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1691000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1691100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1691200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1691300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1691400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1691500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1691600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1691700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1691800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1691900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1692000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1692100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0371, - "step": 1692200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1692300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1692400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1692500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1692600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 1692700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1692800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1692900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1693000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1693100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1693200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1693300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1693400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1693500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1693600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1693700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1693800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 1693900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1694000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1694100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1694200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1694300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1694400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1694500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1694600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1694700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1694800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1694900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1695000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1695100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1695200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1695300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1695400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1695500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1695600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1695700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1695800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1695900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1696000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1696100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1696200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1696300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1696400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1696500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1696600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1696700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1696800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1696900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1697000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1697100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1697200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1697300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1697400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 1697500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1697600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1697700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1697800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1697900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1698000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1698100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1698200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1698300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1698400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1698500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1698600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1698700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1698800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1698900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1699000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1699100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1699200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1699300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1699400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1699500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1699600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1699700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 1699800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1699900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1700000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0290679931640625, - "eval_runtime": 3020.7946, - "eval_samples_per_second": 372.327, - "eval_steps_per_second": 23.271, - "step": 1700000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1700100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1700200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1700300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1700400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1700500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1700600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1700700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1700800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1700900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1701000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1701100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1701200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1701300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1701400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1701500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1701600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1701700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1701800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1701900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1702000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1702100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 1702200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1702300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1702400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1702500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 1702600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1702700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1702800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1702900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1703000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1703100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1703200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1703300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1703400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1703500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1703600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1703700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1703800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1703900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1704000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1704100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1704200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1704300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1704400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1704500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1704600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1704700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1704800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1704900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1705000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1705100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1705200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1705300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1705400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1705500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1705600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1705700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1705800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1705900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1706000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1706100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 1706200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 1706300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1706400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1706500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1706600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1706700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1706800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1706900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1707000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1707100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1707200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1707300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1707400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1707500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1707600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1707700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1707800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1707900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1708000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1708100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1708200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1708300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1708400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 1708500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1708600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 1708700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1708800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1708900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1709000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1709100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1709200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 1709300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 1709400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1709500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1709600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1709700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1709800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1709900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 1710000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1710100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1710200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1710300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 1710400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1710500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1710600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1710700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1710800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1710900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1711000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1711100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1711200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 1711300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1711400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1711500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1711600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1711700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1711800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 1711900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1712000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1712100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1712200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1712300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 1712400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1712500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0296, - "step": 1712600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1712700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1712800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1712900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1713000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1713100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1713200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1713300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1713400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1713500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1713600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1713700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1713800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1713900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1714000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1714100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1714200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1714300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1714400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1714500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1714600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 1714700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1714800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1714900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1715000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1715100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1715200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1715300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1715400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1715500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1715600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1715700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 1715800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1715900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1716000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1716100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 1716200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1716300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1716400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1716500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1716600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1716700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1716800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 1716900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1717000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1717100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1717200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1717300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1717400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1717500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 1717600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 1717700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 1717800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1717900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1718000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 1718100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1718200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1718300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1718400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1718500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1718600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1718700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1718800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 1718900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1719000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1719100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1719200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1719300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1719400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1719500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1719600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1719700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1719800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1719900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1720000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0290069580078125, - "eval_runtime": 3004.4195, - "eval_samples_per_second": 374.356, - "eval_steps_per_second": 23.398, - "step": 1720000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1720100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1720200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1720300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1720400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 1720500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1720600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1720700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1720800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1720900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1721000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1721100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1721200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1721300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 1721400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1721500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1721600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1721700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1721800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1721900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1722000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1722100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1722200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1722300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1722400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1722500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1722600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1722700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1722800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1722900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1723000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1723100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1723200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 1723300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1723400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1723500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1723600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1723700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1723800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1723900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1724000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1724100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1724200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1724300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1724400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1724500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1724600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1724700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1724800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1724900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1725000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1725100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1725200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1725300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1725400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1725500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1725600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1725700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1725800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1725900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1726000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1726100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1726200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1726300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1726400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1726500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1726600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1726700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1726800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1726900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1727000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1727100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1727200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 1727300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1727400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1727500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1727600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1727700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1727800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1727900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0368, - "step": 1728000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1728100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1728200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1728300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1728400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1728500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1728600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1728700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1728800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1728900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1729000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1729100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1729200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1729300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1729400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1729500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1729600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1729700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1729800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 1729900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1730000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1730100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1730200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1730300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 1730400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1730500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1730600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1730700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1730800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1730900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1731000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1731100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1731200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1731300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1731400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1731500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1731600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1731700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1731800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1731900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 1732000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1732100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1732200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1732300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1732400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1732500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1732600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1732700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1732800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1732900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 1733000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1733100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1733200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1733300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1733400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1733500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1733600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1733700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1733800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1733900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1734000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1734100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1734200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1734300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1734400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1734500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1734600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1734700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1734800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1734900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 1735000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1735100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1735200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1735300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1735400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1735500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1735600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 1735700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 1735800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1735900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1736000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1736100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1736200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1736300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1736400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1736500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1736600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1736700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 1736800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1736900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1737000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1737100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1737200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1737300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1737400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 1737500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1737600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 1737700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 1737800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1737900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1738000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1738100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1738200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1738300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1738400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1738500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1738600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 1738700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1738800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1738900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1739000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1739100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1739200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 1739300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 1739400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1739500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1739600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1739700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1739800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1739900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1740000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0287322998046875, - "eval_runtime": 3010.2667, - "eval_samples_per_second": 373.629, - "eval_steps_per_second": 23.352, - "step": 1740000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1740100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1740200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1740300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1740400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0292, - "step": 1740500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1740600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1740700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1740800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1740900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 1741000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1741100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1741200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 1741300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1741400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1741500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1741600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1741700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1741800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1741900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1742000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1742100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1742200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1742300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0368, - "step": 1742400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1742500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1742600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1742700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1742800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1742900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1743000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1743100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1743200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1743300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1743400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1743500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1743600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1743700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1743800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 1743900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1744000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1744100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1744200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1744300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1744400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 1744500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1744600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1744700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1744800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1744900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1745000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1745100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1745200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1745300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1745400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1745500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 1745600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1745700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1745800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1745900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1746000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 1746100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1746200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1746300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1746400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1746500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1746600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1746700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1746800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1746900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 1747000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1747100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1747200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1747300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1747400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1747500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1747600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1747700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1747800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1747900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1748000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1748100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1748200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 1748300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1748400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1748500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1748600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1748700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1748800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1748900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1749000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 1749100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1749200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1749300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1749400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1749500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1749600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1749700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 1749800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1749900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1750000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1750100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1750200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1750300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1750400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1750500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1750600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1750700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1750800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1750900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1751000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1751100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1751200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1751300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1751400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1751500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1751600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1751700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1751800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1751900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1752000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1752100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1752200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1752300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1752400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1752500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1752600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1752700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1752800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1752900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 1753000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1753100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1753200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1753300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 1753400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0295, - "step": 1753500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1753600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1753700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1753800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 1753900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 1754000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1754100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 1754200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1754300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1754400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1754500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1754600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1754700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1754800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1754900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1755000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1755100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 1755200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1755300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1755400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1755500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1755600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1755700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1755800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1755900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1756000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1756100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1756200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1756300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1756400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1756500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 1756600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1756700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1756800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1756900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1757000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1757100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1757200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1757300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1757400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1757500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1757600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1757700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1757800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1757900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0439, - "step": 1758000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1758100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1758200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1758300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1758400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1758500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 1758600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1758700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1758800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1758900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 1759000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0371, - "step": 1759100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1759200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0366, - "step": 1759300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1759400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1759500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1759600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1759700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1759800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1759900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1760000 - }, - { - "epoch": 0.0, - "eval_loss": 0.029693603515625, - "eval_runtime": 3108.7118, - "eval_samples_per_second": 361.797, - "eval_steps_per_second": 22.613, - "step": 1760000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1760100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1760200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1760300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1760400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1760500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1760600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1760700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1760800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1760900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1761000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1761100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1761200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1761300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1761400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1761500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1761600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1761700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1761800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1761900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.044, - "step": 1762000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1762100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1762200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1762300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 1762400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1762500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1762600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1762700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1762800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1762900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1763000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1763100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1763200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1763300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 1763400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1763500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1763600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1763700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1763800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1763900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1764000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1764100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 1764200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1764300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1764400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1764500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1764600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1764700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1764800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1764900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1765000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1765100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1765200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 1765300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1765400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1765500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1765600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 1765700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1765800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1765900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1766000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1766100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1766200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1766300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1766400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1766500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1766600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0544, - "step": 1766700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 1766800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1766900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 1767000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1767100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1767200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1767300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1767400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1767500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1767600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1767700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1767800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1767900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1768000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1768100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1768200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 1768300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1768400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1768500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1768600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1768700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1768800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1768900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1769000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 1769100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 1769200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1769300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1769400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1769500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1769600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1769700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1769800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1769900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1770000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1770100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1770200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1770300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1770400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1770500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1770600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1770700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1770800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1770900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1771000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1771100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1771200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1771300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1771400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1771500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1771600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1771700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 1771800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1771900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1772000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1772100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1772200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1772300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1772400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1772500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1772600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1772700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1772800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1772900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1773000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1773100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1773200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1773300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1773400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1773500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1773600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1773700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1773800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1773900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0369, - "step": 1774000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1774100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1774200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1774300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1774400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1774500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1774600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1774700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1774800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1774900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1775000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 1775100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1775200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1775300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1775400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1775500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1775600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1775700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1775800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1775900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1776000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1776100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1776200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1776300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1776400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 1776500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1776600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1776700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1776800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1776900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 1777000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1777100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1777200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1777300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1777400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1777500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1777600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1777700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1777800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1777900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1778000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1778100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1778200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 1778300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1778400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1778500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1778600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1778700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 1778800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1778900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1779000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1779100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1779200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1779300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1779400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1779500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1779600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1779700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1779800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1779900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1780000 - }, - { - "epoch": 0.0, - "eval_loss": 0.028839111328125, - "eval_runtime": 3155.7136, - "eval_samples_per_second": 356.408, - "eval_steps_per_second": 22.276, - "step": 1780000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1780100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1780200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1780300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1780400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1780500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 1780600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1780700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1780800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1780900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1781000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1781100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1781200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 1781300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 1781400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1781500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1781600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1781700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1781800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1781900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1782000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1782100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1782200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1782300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1782400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1782500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1782600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1782700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1782800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1782900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 1783000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1783100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1783200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1783300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1783400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1783500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1783600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1783700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1783800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1783900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1784000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1784100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1784200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 1784300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 1784400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1784500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1784600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1784700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1784800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1784900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1785000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1785100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1785200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 1785300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1785400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1785500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1785600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1785700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1785800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1785900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1786000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1786100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1786200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1786300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1786400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 1786500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1786600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1786700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 1786800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1786900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0293, - "step": 1787000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1787100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1787200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1787300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1787400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 1787500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1787600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1787700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1787800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1787900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1788000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1788100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1788200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1788300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1788400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1788500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1788600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1788700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1788800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1788900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 1789000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 1789100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1789200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 1789300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1789400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1789500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1789600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1789700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1789800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1789900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1790000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1790100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1790200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1790300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1790400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1790500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1790600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1790700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1790800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1790900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1791000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1791100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1791200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1791300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1791400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1791500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1791600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1791700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1791800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 1791900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1792000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1792100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1792200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1792300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1792400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1792500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1792600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1792700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1792800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1792900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1793000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1793100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1793200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1793300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1793400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1793500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1793600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1793700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1793800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1793900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1794000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1794100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1794200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1794300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1794400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1794500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1794600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1794700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1794800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 1794900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1795000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1795100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1795200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1795300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1795400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 1795500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1795600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1795700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1795800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1795900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1796000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1796100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1796200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1796300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1796400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1796500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1796600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1796700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1796800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1796900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1797000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1797100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1797200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1797300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1797400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1797500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1797600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1797700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1797800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1797900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1798000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1798100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1798200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1798300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1798400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1798500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1798600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1798700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1798800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1798900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1799000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1799100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1799200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1799300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 1799400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1799500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1799600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 1799700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1799800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1799900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1800000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0287017822265625, - "eval_runtime": 3102.5431, - "eval_samples_per_second": 362.516, - "eval_steps_per_second": 22.658, - "step": 1800000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1800100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1800200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1800300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1800400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1800500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 1800600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1800700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1800800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1800900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1801000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1801100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 1801200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1801300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1801400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1801500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1801600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1801700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1801800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1801900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1802000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1802100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1802200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1802300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1802400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1802500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1802600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1802700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 1802800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1802900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1803000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1803100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1803200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1803300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1803400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1803500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1803600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1803700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1803800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1803900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1804000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1804100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1804200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1804300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1804400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1804500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1804600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1804700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1804800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1804900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1805000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1805100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1805200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1805300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1805400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1805500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1805600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1805700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1805800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1805900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1806000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1806100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1806200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1806300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 1806400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1806500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1806600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1806700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 1806800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1806900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1807000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1807100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1807200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1807300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1807400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1807500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1807600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1807700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1807800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1807900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1808000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1808100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1808200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1808300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1808400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1808500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1808600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1808700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1808800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1808900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1809000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1809100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1809200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1809300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1809400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1809500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1809600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1809700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1809800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1809900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 1810000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1810100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1810200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 1810300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 1810400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1810500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1810600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1810700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1810800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 1810900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1811000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1811100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1811200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1811300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1811400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1811500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1811600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 1811700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1811800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1811900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1812000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1812100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1812200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1812300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1812400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1812500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1812600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1812700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1812800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1812900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 1813000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1813100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1813200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1813300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1813400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1813500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1813600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1813700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1813800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1813900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1814000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1814100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1814200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1814300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1814400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1814500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1814600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1814700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1814800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1814900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1815000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1815100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1815200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1815300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1815400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1815500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 1815600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1815700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1815800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1815900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1816000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1816100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 1816200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 1816300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1816400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 1816500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1816600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1816700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1816800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 1816900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1817000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1817100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1817200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1817300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 1817400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1817500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1817600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1817700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1817800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 1817900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1818000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1818100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1818200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1818300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1818400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1818500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 1818600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1818700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1818800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1818900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1819000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0295, - "step": 1819100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1819200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1819300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1819400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1819500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1819600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1819700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1819800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1819900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1820000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0287628173828125, - "eval_runtime": 3070.2573, - "eval_samples_per_second": 366.329, - "eval_steps_per_second": 22.896, - "step": 1820000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1820100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1820200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 1820300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1820400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 1820500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1820600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1820700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0391, - "step": 1820800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1820900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1821000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1821100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1821200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1821300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1821400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1821500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1821600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1821700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 1821800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1821900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1822000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1822100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1822200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1822300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1822400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1822500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1822600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1822700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1822800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1822900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1823000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1823100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1823200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1823300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1823400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1823500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1823600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1823700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1823800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1823900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1824000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 1824100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 1824200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1824300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1824400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1824500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1824600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1824700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1824800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1824900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1825000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1825100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 1825200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1825300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1825400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1825500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1825600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1825700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1825800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 1825900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0509, - "step": 1826000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0388, - "step": 1826100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1826200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 1826300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0506, - "step": 1826400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 1826500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 1826600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1826700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1826800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1826900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1827000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1827100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1827200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1827300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1827400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1827500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0388, - "step": 1827600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1827700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1827800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1827900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1828000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1828100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 1828200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1828300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1828400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1828500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1828600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1828700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1828800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1828900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1829000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1829100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1829200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1829300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1829400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1829500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1829600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1829700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1829800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1829900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1830000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1830100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0402, - "step": 1830200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 1830300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 1830400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 1830500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0392, - "step": 1830600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0373, - "step": 1830700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0394, - "step": 1830800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0368, - "step": 1830900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 1831000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 1831100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 1831200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0367, - "step": 1831300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1831400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1831500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1831600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 1831700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1831800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1831900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1832000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1832100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1832200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1832300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 1832400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1832500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1832600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1832700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1832800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1832900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1833000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1833100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1833200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1833300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1833400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1833500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1833600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 1833700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1833800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1833900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1834000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1834100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1834200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1834300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1834400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1834500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1834600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1834700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1834800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1834900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1835000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1835100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1835200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1835300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1835400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0527, - "step": 1835500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1835600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1835700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1835800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1835900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1836000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1836100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1836200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1836300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1836400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1836500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 1836600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 1836700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0375, - "step": 1836800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0617, - "step": 1836900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 1837000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1837100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1837200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 1837300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1837400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1837500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1837600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1837700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0364, - "step": 1837800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1837900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 1838000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1838100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1838200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 1838300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0374, - "step": 1838400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1838500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1838600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1838700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1838800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1838900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1839000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1839100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1839200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1839300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1839400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1839500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1839600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1839700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1839800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1839900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1840000 - }, - { - "epoch": 0.0, - "eval_loss": 0.028961181640625, - "eval_runtime": 3049.6835, - "eval_samples_per_second": 368.8, - "eval_steps_per_second": 23.05, - "step": 1840000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1840100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1840200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1840300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1840400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1840500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1840600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1840700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1840800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1840900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1841000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1841100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1841200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 1841300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1841400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1841500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1841600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1841700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1841800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1841900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1842000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1842100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1842200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1842300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1842400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1842500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1842600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1842700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1842800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1842900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1843000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 1843100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1843200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 1843300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1843400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 1843500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1843600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 1843700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1843800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1843900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1844000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1844100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 1844200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1844300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1844400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1844500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1844600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1844700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1844800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1844900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1845000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 1845100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 1845200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1845300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0371, - "step": 1845400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1845500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1845600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1845700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1845800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1845900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1846000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1846100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 1846200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1846300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1846400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1846500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1846600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1846700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1846800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1846900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1847000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1847100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1847200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1847300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1847400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1847500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1847600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1847700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1847800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1847900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1848000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1848100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1848200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1848300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1848400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1848500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1848600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1848700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1848800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 1848900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1849000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1849100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 1849200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 1849300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1849400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1849500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1849600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1849700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 1849800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1849900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1850000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1850100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1850200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1850300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1850400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1850500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1850600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1850700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1850800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1850900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 1851000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1851100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1851200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1851300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1851400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1851500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1851600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1851700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 1851800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1851900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1852000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1852100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 1852200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1852300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1852400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1852500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 1852600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1852700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1852800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1852900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1853000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1853100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1853200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1853300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1853400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1853500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1853600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1853700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1853800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1853900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1854000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1854100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1854200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1854300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1854400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1854500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1854600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1854700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1854800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1854900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1855000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1855100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1855200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1855300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1855400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1855500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 1855600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1855700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 1855800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1855900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 1856000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1856100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1856200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1856300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1856400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1856500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1856600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1856700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1856800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1856900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1857000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1857100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1857200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1857300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1857400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1857500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1857600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0288, - "step": 1857700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1857800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1857900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1858000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 1858100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1858200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1858300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1858400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1858500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1858600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 1858700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1858800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1858900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1859000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1859100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1859200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1859300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1859400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1859500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1859600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1859700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1859800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1859900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1860000 - }, - { - "epoch": 0.0, - "eval_loss": 0.029266357421875, - "eval_runtime": 3065.0985, - "eval_samples_per_second": 366.945, - "eval_steps_per_second": 22.934, - "step": 1860000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1860100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1860200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1860300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1860400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1860500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1860600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1860700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1860800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1860900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 1861000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1861100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1861200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 1861300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1861400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1861500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1861600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1861700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1861800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1861900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1862000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 1862100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1862200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1862300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1862400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1862500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1862600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1862700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1862800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1862900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1863000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1863100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1863200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1863300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1863400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1863500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1863600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1863700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1863800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1863900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 1864000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1864100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1864200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1864300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1864400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1864500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1864600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1864700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1864800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1864900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1865000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1865100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1865200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1865300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1865400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1865500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1865600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1865700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1865800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1865900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1866000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 1866100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1866200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1866300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1866400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1866500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1866600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1866700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1866800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1866900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1867000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1867100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1867200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1867300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1867400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1867500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1867600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1867700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1867800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 1867900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1868000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1868100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1868200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1868300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1868400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1868500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1868600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1868700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1868800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1868900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1869000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 1869100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1869200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1869300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1869400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1869500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1869600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1869700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1869800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1869900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1870000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1870100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1870200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 1870300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1870400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1870500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1870600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1870700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1870800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1870900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 1871000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1871100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1871200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1871300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1871400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 1871500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1871600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1871700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1871800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1871900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1872000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1872100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1872200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1872300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1872400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1872500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1872600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1872700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1872800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1872900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1873000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1873100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1873200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1873300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1873400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1873500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1873600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1873700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1873800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1873900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1874000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1874100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1874200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1874300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1874400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 1874500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1874600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0397, - "step": 1874700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1874800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1874900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1875000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1875100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1875200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1875300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1875400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1875500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1875600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1875700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1875800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1875900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1876000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1876100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1876200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1876300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1876400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0367, - "step": 1876500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1876600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1876700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1876800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1876900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 1877000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1877100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1877200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1877300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1877400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1877500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1877600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1877700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1877800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1877900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1878000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1878100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1878200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1878300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1878400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1878500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1878600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1878700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1878800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1878900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1879000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1879100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1879200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1879300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1879400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1879500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1879600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 1879700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1879800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 1879900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1880000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0290069580078125, - "eval_runtime": 3078.4324, - "eval_samples_per_second": 365.356, - "eval_steps_per_second": 22.835, - "step": 1880000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1880100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1880200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1880300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1880400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 1880500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1880600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1880700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 1880800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1880900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1881000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1881100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1881200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1881300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1881400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1881500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1881600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1881700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1881800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1881900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 1882000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1882100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1882200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1882300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1882400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1882500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1882600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1882700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1882800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1882900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1883000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1883100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1883200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1883300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1883400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1883500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1883600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1883700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1883800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1883900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1884000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1884100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1884200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1884300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 1884400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1884500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1884600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1884700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1884800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1884900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1885000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1885100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1885200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 1885300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1885400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1885500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1885600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1885700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0523, - "step": 1885800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1885900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1886000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1886100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1886200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1886300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1886400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1886500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1886600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1886700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1886800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1886900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1887000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1887100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1887200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1887300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1887400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1887500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1887600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1887700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1887800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1887900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1888000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1888100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1888200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1888300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0391, - "step": 1888400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1888500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1888600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1888700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1888800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1888900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1889000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1889100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1889200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1889300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1889400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1889500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1889600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1889700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1889800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1889900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1890000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1890100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1890200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0387, - "step": 1890300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1890400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1890500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1890600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1890700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1890800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1890900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1891000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1891100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1891200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1891300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1891400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1891500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1891600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1891700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1891800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1891900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1892000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1892100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1892200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1892300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1892400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1892500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1892600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1892700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1892800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1892900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1893000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1893100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1893200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1893300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 1893400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1893500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1893600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1893700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1893800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1893900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1894000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1894100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1894200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1894300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1894400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1894500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1894600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 1894700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1894800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1894900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1895000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1895100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 1895200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1895300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1895400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1895500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 1895600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1895700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1895800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1895900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1896000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1896100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 1896200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1896300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1896400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1896500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1896600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1896700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1896800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 1896900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1897000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 1897100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1897200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1897300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1897400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1897500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1897600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1897700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1897800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1897900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 1898000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1898100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 1898200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1898300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1898400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1898500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1898600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1898700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1898800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1898900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1899000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1899100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1899200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1899300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1899400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 1899500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1899600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1899700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1899800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1899900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1900000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0289764404296875, - "eval_runtime": 3119.3995, - "eval_samples_per_second": 360.558, - "eval_steps_per_second": 22.535, - "step": 1900000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1900100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1900200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1900300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1900400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1900500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1900600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1900700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 1900800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1900900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1901000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1901100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1901200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1901300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1901400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1901500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1901600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 1901700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1901800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1901900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 1902000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1902100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1902200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1902300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1902400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1902500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1902600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1902700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1902800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1902900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1903000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 1903100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1903200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1903300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1903400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1903500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1903600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 1903700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1903800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1903900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1904000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1904100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1904200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0384, - "step": 1904300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 1904400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1904500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1904600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1904700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1904800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 1904900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0381, - "step": 1905000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1905100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1905200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0363, - "step": 1905300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1905400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1905500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 1905600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1905700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1905800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1905900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1906000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 1906100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1906200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1906300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1906400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1906500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1906600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1906700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1906800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1906900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1907000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1907100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1907200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1907300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1907400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1907500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1907600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1907700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1907800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1907900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1908000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1908100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1908200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 1908300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1908400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1908500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1908600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1908700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1908800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1908900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1909000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1909100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1909200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1909300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1909400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1909500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 1909600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1909700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1909800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1909900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1910000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1910100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1910200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 1910300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1910400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 1910500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1910600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1910700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1910800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1910900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1911000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1911100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1911200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1911300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1911400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 1911500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1911600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1911700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1911800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1911900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1912000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1912100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1912200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1912300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1912400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1912500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1912600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1912700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1912800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1912900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1913000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1913100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1913200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1913300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1913400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1913500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1913600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1913700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1913800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 1913900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1914000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 1914100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1914200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1914300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1914400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1914500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1914600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1914700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1914800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1914900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1915000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1915100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1915200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1915300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1915400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1915500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1915600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1915700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1915800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1915900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1916000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 1916100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1916200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 1916300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1916400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1916500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1916600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1916700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1916800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1916900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1917000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1917100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1917200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1917300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1917400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1917500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1917600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1917700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1917800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1917900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 1918000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1918100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1918200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1918300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1918400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1918500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 1918600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 1918700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 1918800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1918900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1919000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1919100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 1919200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1919300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 1919400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1919500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1919600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1919700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1919800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1919900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1920000 - }, - { - "epoch": 0.0, - "eval_loss": 0.028839111328125, - "eval_runtime": 3158.9184, - "eval_samples_per_second": 356.047, - "eval_steps_per_second": 22.253, - "step": 1920000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1920100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1920200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 1920300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1920400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1920500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1920600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1920700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1920800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1920900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1921000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1921100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1921200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1921300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1921400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1921500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1921600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1921700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1921800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1921900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 1922000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1922100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1922200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 1922300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1922400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1922500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1922600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1922700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1922800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1922900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1923000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1923100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1923200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1923300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1923400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1923500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1923600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 1923700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1923800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1923900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 1924000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1924100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1924200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1924300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1924400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1924500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1924600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1924700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1924800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1924900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1925000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 1925100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1925200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1925300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1925400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1925500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1925600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1925700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 1925800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1925900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1926000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1926100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1926200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1926300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1926400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1926500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1926600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1926700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1926800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1926900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1927000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1927100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1927200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1927300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1927400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1927500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1927600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1927700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 1927800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1927900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1928000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0367, - "step": 1928100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1928200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1928300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1928400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1928500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 1928600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1928700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1928800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1928900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1929000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1929100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1929200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 1929300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1929400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1929500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1929600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1929700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1929800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 1929900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1930000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1930100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1930200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1930300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1930400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1930500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 1930600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1930700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1930800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1930900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1931000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1931100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1931200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1931300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1931400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 1931500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1931600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1931700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1931800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1931900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1932000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1932100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1932200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1932300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1932400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1932500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1932600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1932700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 1932800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 1932900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1933000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 1933100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1933200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 1933300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1933400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1933500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1933600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1933700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1933800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1933900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 1934000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1934100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 1934200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1934300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1934400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 1934500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 1934600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1934700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1934800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1934900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1935000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1935100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1935200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1935300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1935400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1935500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1935600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1935700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1935800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 1935900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1936000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1936100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 1936200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1936300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1936400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1936500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1936600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1936700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1936800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1936900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1937000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1937100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1937200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 1937300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1937400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1937500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1937600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1937700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1937800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1937900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1938000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1938100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1938200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1938300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0296, - "step": 1938400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1938500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1938600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1938700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1938800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1938900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1939000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1939100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1939200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1939300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 1939400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 1939500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1939600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1939700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 1939800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1939900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1940000 - }, - { - "epoch": 0.0, - "eval_loss": 0.028839111328125, - "eval_runtime": 3465.8168, - "eval_samples_per_second": 324.519, - "eval_steps_per_second": 20.283, - "step": 1940000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1940100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1940200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1940300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 1940400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1940500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1940600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1940700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1940800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 1940900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 1941000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0378, - "step": 1941100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1941200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1941300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1941400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1941500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1941600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1941700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1941800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1941900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1942000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1942100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1942200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1942300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1942400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1942500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 1942600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1942700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1942800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1942900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1943000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 1943100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1943200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1943300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1943400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1943500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1943600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1943700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1943800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1943900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1944000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 1944100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1944200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1944300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1944400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1944500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1944600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1944700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1944800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1944900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1945000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1945100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 1945200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 1945300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1945400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1945500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1945600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 1945700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1945800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1945900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1946000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 1946100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1946200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1946300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1946400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 1946500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1946600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1946700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1946800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1946900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1947000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1947100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1947200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1947300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1947400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1947500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1947600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1947700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1947800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1947900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1948000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1948100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1948200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1948300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1948400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1948500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1948600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1948700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1948800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1948900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1949000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1949100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1949200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1949300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1949400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1949500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1949600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1949700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1949800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1949900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1950000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1950100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1950200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1950300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1950400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1950500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 1950600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1950700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 1950800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1950900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1951000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1951100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1951200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1951300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1951400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1951500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 1951600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1951700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 1951800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 1951900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1952000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1952100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1952200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1952300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1952400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1952500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1952600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1952700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1952800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1952900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1953000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1953100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1953200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1953300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1953400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1953500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1953600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1953700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1953800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1953900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1954000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1954100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1954200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.029, - "step": 1954300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1954400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1954500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1954600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1954700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1954800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1954900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1955000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1955100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1955200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1955300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1955400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1955500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1955600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1955700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1955800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1955900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1956000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1956100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1956200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1956300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1956400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1956500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1956600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1956700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1956800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1956900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1957000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 1957100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1957200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1957300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1957400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1957500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1957600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1957700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 1957800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1957900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1958000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1958100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1958200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1958300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1958400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1958500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1958600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 1958700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 1958800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1958900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1959000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1959100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1959200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1959300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1959400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0296, - "step": 1959500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1959600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1959700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1959800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1959900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1960000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0286865234375, - "eval_runtime": 3421.7973, - "eval_samples_per_second": 328.694, - "eval_steps_per_second": 20.544, - "step": 1960000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1960100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 1960200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1960300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 1960400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1960500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1960600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1960700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1960800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1960900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1961000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1961100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1961200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1961300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1961400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1961500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1961600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1961700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1961800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1961900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1962000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1962100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1962200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1962300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 1962400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1962500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1962600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1962700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1962800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1962900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1963000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1963100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1963200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1963300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1963400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1963500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1963600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1963700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1963800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1963900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1964000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1964100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 1964200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 1964300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1964400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 1964500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1964600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1964700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1964800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1964900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1965000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1965100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1965200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1965300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1965400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1965500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1965600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1965700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 1965800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1965900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1966000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1966100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1966200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1966300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1966400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1966500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1966600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1966700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1966800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1966900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1967000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1967100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1967200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1967300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1967400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1967500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1967600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1967700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1967800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1967900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1968000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 1968100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0294, - "step": 1968200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1968300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 1968400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1968500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1968600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1968700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1968800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1968900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1969000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1969100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1969200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1969300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1969400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1969500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1969600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1969700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 1969800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1969900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1970000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1970100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1970200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1970300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1970400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1970500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1970600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1970700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1970800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1970900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1971000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1971100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1971200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1971300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1971400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1971500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1971600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 1971700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1971800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 1971900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1972000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1972100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1972200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1972300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1972400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1972500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 1972600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 1972700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1972800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1972900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1973000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1973100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1973200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 1973300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1973400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1973500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 1973600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1973700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1973800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 1973900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1974000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1974100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1974200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1974300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1974400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1974500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1974600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1974700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.029, - "step": 1974800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1974900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 1975000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1975100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1975200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1975300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 1975400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1975500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1975600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 1975700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1975800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1975900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1976000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1976100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1976200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1976300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1976400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 1976500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1976600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 1976700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1976800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 1976900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 1977000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 1977100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1977200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 1977300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1977400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1977500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1977600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1977700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1977800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 1977900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1978000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1978100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1978200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1978300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1978400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1978500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1978600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1978700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1978800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1978900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 1979000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1979100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1979200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1979300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1979400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1979500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1979600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1979700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1979800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 1979900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 1980000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0838623046875, - "eval_runtime": 3306.5915, - "eval_samples_per_second": 340.146, - "eval_steps_per_second": 21.259, - "step": 1980000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0407, - "step": 1980100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 1980200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1980300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 1980400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1980500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1980600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1980700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1980800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1980900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1981000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 1981100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1981200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1981300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1981400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1981500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1981600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1981700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 1981800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1981900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1982000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1982100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1982200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 1982300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1982400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 1982500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1982600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1982700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 1982800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1982900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 1983000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1983100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 1983200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1983300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1983400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1983500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1983600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1983700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1983800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1983900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1984000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1984100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1984200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1984300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1984400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1984500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1984600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1984700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1984800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1984900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 1985000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1985100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 1985200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1985300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1985400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1985500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1985600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1985700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1985800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1985900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1986000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1986100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 1986200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1986300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 1986400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1986500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1986600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1986700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1986800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1986900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1987000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1987100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1987200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1987300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1987400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1987500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1987600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1987700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1987800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1987900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1988000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 1988100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1988200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1988300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1988400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1988500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1988600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1988700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 1988800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1988900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1989000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1989100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1989200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1989300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1989400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 1989500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1989600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1989700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 1989800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1989900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1990000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1990100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1990200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1990300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 1990400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 1990500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1990600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1990700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1990800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1990900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1991000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 1991100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 1991200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1991300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1991400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1991500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1991600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1991700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1991800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1991900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1992000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1992100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1992200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1992300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 1992400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1992500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1992600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 1992700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1992800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1992900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1993000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1993100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1993200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 1993300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 1993400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 1993500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1993600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1993700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1993800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 1993900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 1994000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1994100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1994200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1994300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1994400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1994500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 1994600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1994700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1994800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 1994900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1995000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1995100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1995200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1995300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 1995400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1995500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1995600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1995700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 1995800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1995900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 1996000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1996100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1996200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1996300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1996400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1996500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1996600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 1996700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1996800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1996900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1997000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 1997100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 1997200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1997300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1997400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1997500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 1997600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 1997700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1997800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 1997900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1998000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 1998100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 1998200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 1998300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1998400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 1998500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 1998600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 1998700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 1998800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1998900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 1999000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 1999100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 1999200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 1999300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1999400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1999500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 1999600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 1999700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 1999800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 1999900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2000000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0287628173828125, - "eval_runtime": 3627.9486, - "eval_samples_per_second": 310.016, - "eval_steps_per_second": 19.376, - "step": 2000000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2000100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2000200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2000300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2000400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2000500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2000600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2000700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2000800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2000900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 2001000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 2001100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2001200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2001300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2001400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2001500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2001600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2001700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2001800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2001900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2002000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2002100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2002200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 2002300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2002400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2002500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2002600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2002700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2002800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 2002900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2003000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2003100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2003200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2003300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 2003400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2003500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2003600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2003700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2003800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2003900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2004000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2004100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2004200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2004300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2004400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2004500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 2004600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2004700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2004800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2004900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2005000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2005100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2005200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2005300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2005400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2005500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2005600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2005700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2005800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2005900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2006000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 2006100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2006200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2006300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2006400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2006500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2006600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2006700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2006800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2006900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2007000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2007100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2007200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2007300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 2007400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2007500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2007600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 2007700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2007800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2007900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2008000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2008100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 2008200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2008300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2008400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2008500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2008600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2008700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 2008800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2008900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2009000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 2009100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2009200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2009300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2009400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 2009500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 2009600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2009700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 2009800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2009900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2010000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 2010100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 2010200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2010300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2010400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2010500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2010600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 2010700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2010800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2010900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2011000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2011100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2011200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2011300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 2011400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2011500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2011600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2011700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2011800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 2011900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2012000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2012100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2012200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2012300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2012400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2012500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2012600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2012700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2012800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2012900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2013000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2013100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2013200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2013300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2013400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2013500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2013600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2013700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2013800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2013900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2014000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2014100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2014200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2014300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2014400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2014500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2014600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2014700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2014800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2014900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2015000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2015100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2015200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2015300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 2015400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2015500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2015600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2015700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 2015800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2015900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2016000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2016100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0376, - "step": 2016200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 2016300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 2016400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 2016500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 2016600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2016700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2016800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 2016900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 2017000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2017100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2017200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2017300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2017400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2017500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 2017600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2017700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2017800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 2017900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 2018000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 2018100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2018200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2018300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2018400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2018500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2018600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2018700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2018800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2018900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2019000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2019100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2019200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2019300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2019400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2019500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2019600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2019700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2019800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2019900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2020000 - }, - { - "epoch": 0.0, - "eval_loss": 0.02899169921875, - "eval_runtime": 3078.8528, - "eval_samples_per_second": 365.306, - "eval_steps_per_second": 22.832, - "step": 2020000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2020100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 2020200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2020300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2020400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2020500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2020600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2020700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 2020800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2020900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 2021000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 2021100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2021200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2021300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2021400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 2021500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2021600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 2021700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 2021800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2021900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2022000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2022100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2022200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2022300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2022400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2022500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 2022600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2022700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2022800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2022900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2023000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2023100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2023200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2023300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2023400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2023500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2023600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2023700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2023800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2023900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2024000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2024100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2024200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2024300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2024400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2024500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2024600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2024700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2024800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2024900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2025000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2025100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2025200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2025300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2025400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2025500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2025600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2025700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2025800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2025900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2026000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2026100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2026200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2026300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2026400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2026500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2026600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2026700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2026800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2026900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2027000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2027100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2027200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2027300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2027400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2027500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2027600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2027700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2027800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2027900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2028000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2028100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2028200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2028300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2028400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2028500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2028600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2028700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2028800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 2028900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2029000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 2029100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 2029200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2029300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2029400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2029500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2029600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2029700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2029800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2029900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2030000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2030100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2030200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2030300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2030400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2030500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2030600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2030700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0296, - "step": 2030800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2030900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2031000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2031100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2031200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2031300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 2031400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2031500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2031600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2031700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 2031800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 2031900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2032000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2032100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2032200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2032300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2032400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 2032500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2032600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2032700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2032800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 2032900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 2033000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2033100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2033200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2033300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2033400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0296, - "step": 2033500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2033600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2033700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2033800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2033900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2034000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2034100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2034200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2034300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2034400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2034500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2034600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2034700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2034800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2034900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 2035000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 2035100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2035200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2035300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 2035400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2035500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2035600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2035700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 2035800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2035900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2036000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2036100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2036200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2036300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2036400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2036500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2036600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 2036700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2036800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2036900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2037000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2037100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2037200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2037300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 2037400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 2037500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2037600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2037700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2037800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2037900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2038000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2038100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2038200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2038300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2038400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2038500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2038600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2038700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 2038800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 2038900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2039000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2039100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2039200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2039300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2039400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2039500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2039600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2039700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2039800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2039900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2040000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0289764404296875, - "eval_runtime": 3149.7392, - "eval_samples_per_second": 357.084, - "eval_steps_per_second": 22.318, - "step": 2040000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2040100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2040200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2040300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2040400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2040500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2040600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2040700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2040800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 2040900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2041000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2041100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 2041200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2041300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2041400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2041500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2041600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2041700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2041800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2041900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2042000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 2042100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 2042200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 2042300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2042400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2042500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2042600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2042700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2042800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 2042900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2043000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2043100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2043200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2043300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 2043400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2043500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2043600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2043700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2043800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 2043900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2044000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2044100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0292, - "step": 2044200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2044300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2044400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 2044500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2044600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 2044700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2044800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 2044900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2045000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2045100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2045200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2045300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2045400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 2045500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2045600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2045700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2045800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2045900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2046000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2046100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2046200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2046300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2046400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 2046500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2046600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2046700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2046800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2046900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0294, - "step": 2047000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2047100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 2047200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2047300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2047400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2047500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2047600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2047700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2047800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2047900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2048000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2048100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2048200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2048300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2048400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2048500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2048600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2048700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2048800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2048900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2049000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2049100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2049200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2049300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2049400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2049500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 2049600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2049700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 2049800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2049900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2050000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2050100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2050200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2050300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2050400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2050500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2050600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2050700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2050800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2050900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2051000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2051100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2051200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2051300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2051400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2051500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2051600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2051700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2051800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2051900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2052000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2052100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2052200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2052300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2052400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2052500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2052600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2052700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 2052800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2052900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2053000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2053100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2053200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2053300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2053400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2053500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2053600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2053700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2053800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2053900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2054000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2054100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2054200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 2054300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2054400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2054500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2054600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2054700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2054800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2054900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2055000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 2055100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 2055200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2055300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2055400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2055500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2055600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2055700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2055800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2055900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2056000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2056100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2056200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2056300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2056400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2056500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2056600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 2056700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2056800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2056900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2057000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2057100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2057200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2057300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2057400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 2057500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2057600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2057700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2057800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2057900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2058000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2058100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2058200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2058300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2058400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2058500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2058600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2058700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2058800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2058900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2059000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2059100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2059200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 2059300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2059400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2059500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2059600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2059700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2059800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2059900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2060000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0288543701171875, - "eval_runtime": 3840.4216, - "eval_samples_per_second": 292.864, - "eval_steps_per_second": 18.304, - "step": 2060000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2060100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2060200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2060300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2060400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2060500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2060600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2060700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2060800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 2060900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2061000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2061100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2061200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2061300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2061400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2061500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2061600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2061700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 2061800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 2061900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2062000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2062100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2062200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2062300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2062400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2062500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2062600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 2062700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2062800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 2062900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2063000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 2063100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2063200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2063300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2063400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2063500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2063600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2063700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2063800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2063900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2064000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2064100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2064200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2064300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2064400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2064500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2064600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2064700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2064800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0507, - "step": 2064900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2065000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2065100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2065200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2065300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2065400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2065500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 2065600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2065700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2065800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2065900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2066000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2066100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2066200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 2066300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2066400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2066500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2066600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2066700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 2066800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2066900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 2067000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2067100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2067200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2067300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2067400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2067500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2067600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 2067700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2067800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2067900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2068000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2068100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2068200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2068300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2068400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2068500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2068600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2068700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2068800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2068900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2069000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2069100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2069200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2069300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 2069400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2069500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2069600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2069700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2069800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2069900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2070000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2070100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2070200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2070300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2070400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2070500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2070600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2070700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 2070800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2070900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2071000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2071100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2071200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2071300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2071400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2071500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2071600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2071700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2071800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2071900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2072000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2072100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2072200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2072300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2072400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2072500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2072600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2072700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 2072800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 2072900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2073000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2073100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2073200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2073300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2073400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2073500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2073600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2073700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2073800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2073900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2074000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2074100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2074200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2074300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2074400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 2074500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2074600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2074700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2074800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2074900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2075000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2075100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2075200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2075300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2075400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2075500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2075600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2075700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2075800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2075900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2076000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2076100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2076200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2076300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2076400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 2076500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2076600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2076700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2076800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2076900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2077000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2077100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2077200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2077300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 2077400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 2077500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2077600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2077700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2077800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2077900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2078000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2078100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2078200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2078300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2078400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2078500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 2078600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2078700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2078800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2078900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2079000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2079100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 2079200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2079300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2079400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2079500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2079600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2079700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2079800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2079900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2080000 - }, - { - "epoch": 0.0, - "eval_loss": 0.028472900390625, - "eval_runtime": 3134.6894, - "eval_samples_per_second": 358.799, - "eval_steps_per_second": 22.425, - "step": 2080000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2080100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2080200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 2080300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2080400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2080500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2080600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2080700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2080800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2080900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2081000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2081100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2081200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 2081300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2081400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2081500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 2081600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2081700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2081800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 2081900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2082000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2082100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2082200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2082300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2082400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2082500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2082600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2082700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2082800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 2082900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2083000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2083100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2083200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2083300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2083400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2083500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2083600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2083700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2083800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2083900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2084000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2084100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2084200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2084300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2084400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2084500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2084600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2084700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 2084800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2084900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2085000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2085100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2085200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2085300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2085400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2085500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2085600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2085700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2085800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2085900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2086000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2086100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2086200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2086300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2086400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2086500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2086600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 2086700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2086800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2086900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2087000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2087100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2087200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 2087300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2087400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2087500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 2087600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2087700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2087800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2087900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2088000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2088100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2088200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2088300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2088400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2088500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2088600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2088700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2088800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2088900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 2089000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 2089100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 2089200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2089300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2089400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2089500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2089600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2089700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2089800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2089900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2090000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2090100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2090200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 2090300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2090400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2090500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2090600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2090700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2090800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2090900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2091000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2091100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2091200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2091300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2091400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2091500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2091600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 2091700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2091800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2091900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2092000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2092100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2092200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2092300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2092400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2092500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2092600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 2092700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2092800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2092900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 2093000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 2093100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2093200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2093300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2093400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2093500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2093600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2093700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2093800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2093900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2094000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2094100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2094200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2094300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2094400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2094500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 2094600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 2094700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2094800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2094900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2095000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2095100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2095200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2095300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0439, - "step": 2095400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 2095500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 2095600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2095700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2095800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2095900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2096000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2096100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2096200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2096300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2096400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2096500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 2096600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2096700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2096800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2096900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2097000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2097100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2097200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2097300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2097400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2097500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2097600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2097700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2097800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2097900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2098000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2098100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2098200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2098300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 2098400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2098500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2098600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2098700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2098800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2098900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2099000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2099100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2099200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2099300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2099400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2099500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2099600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2099700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2099800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2099900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2100000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0286865234375, - "eval_runtime": 3215.2333, - "eval_samples_per_second": 349.811, - "eval_steps_per_second": 21.863, - "step": 2100000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2100100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2100200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2100300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2100400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2100500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2100600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2100700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2100800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2100900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2101000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2101100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2101200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 2101300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2101400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0296, - "step": 2101500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2101600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2101700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2101800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2101900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2102000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2102100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2102200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2102300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2102400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2102500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2102600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2102700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2102800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2102900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2103000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2103100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2103200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2103300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2103400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2103500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2103600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2103700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2103800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2103900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2104000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2104100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2104200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2104300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2104400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2104500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2104600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2104700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2104800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2104900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2105000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 2105100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 2105200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2105300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2105400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2105500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2105600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2105700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2105800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2105900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2106000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2106100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2106200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 2106300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2106400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2106500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2106600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2106700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2106800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2106900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2107000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2107100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2107200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2107300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2107400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2107500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2107600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2107700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2107800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2107900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2108000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2108100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2108200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2108300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2108400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2108500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2108600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2108700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2108800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2108900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2109000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 2109100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2109200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2109300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2109400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2109500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2109600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2109700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2109800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2109900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2110000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2110100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2110200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2110300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2110400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2110500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2110600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2110700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2110800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2110900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2111000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2111100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2111200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2111300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2111400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2111500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2111600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2111700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2111800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2111900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 2112000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 2112100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2112200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2112300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2112400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2112500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2112600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2112700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2112800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2112900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2113000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2113100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2113200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2113300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2113400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2113500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2113600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2113700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2113800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2113900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2114000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2114100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 2114200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2114300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2114400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2114500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2114600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2114700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2114800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2114900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2115000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2115100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2115200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2115300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2115400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2115500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2115600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0295, - "step": 2115700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2115800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2115900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2116000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2116100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2116200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2116300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2116400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2116500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2116600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2116700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2116800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2116900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2117000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2117100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2117200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2117300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2117400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2117500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2117600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0286, - "step": 2117700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2117800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 2117900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2118000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2118100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2118200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2118300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2118400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2118500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2118600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2118700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2118800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2118900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2119000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2119100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2119200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2119300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2119400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2119500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2119600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2119700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2119800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 2119900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2120000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0284881591796875, - "eval_runtime": 3185.3885, - "eval_samples_per_second": 353.088, - "eval_steps_per_second": 22.068, - "step": 2120000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2120100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2120200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2120300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2120400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2120500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2120600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2120700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2120800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2120900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2121000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2121100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2121200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2121300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 2121400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2121500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2121600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2121700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2121800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2121900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2122000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 2122100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2122200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2122300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2122400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2122500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2122600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2122700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2122800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2122900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2123000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2123100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 2123200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2123300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2123400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2123500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2123600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2123700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2123800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2123900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2124000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2124100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2124200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2124300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 2124400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2124500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2124600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 2124700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2124800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 2124900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2125000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2125100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2125200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2125300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2125400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2125500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2125600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2125700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2125800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2125900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2126000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2126100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2126200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2126300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2126400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2126500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2126600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2126700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2126800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2126900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2127000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2127100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2127200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2127300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2127400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2127500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2127600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2127700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2127800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 2127900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2128000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2128100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2128200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2128300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2128400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2128500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2128600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 2128700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2128800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2128900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2129000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2129100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0296, - "step": 2129200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 2129300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2129400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2129500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2129600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 2129700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2129800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0295, - "step": 2129900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2130000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0293, - "step": 2130100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2130200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2130300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 2130400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2130500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2130600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2130700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2130800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2130900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2131000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2131100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2131200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2131300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2131400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2131500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2131600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2131700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2131800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 2131900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2132000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2132100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2132200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2132300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2132400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2132500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2132600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2132700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2132800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2132900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2133000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2133100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2133200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2133300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2133400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 2133500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 2133600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2133700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2133800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2133900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2134000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2134100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 2134200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2134300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2134400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2134500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2134600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2134700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2134800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 2134900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2135000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2135100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2135200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2135300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2135400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2135500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2135600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 2135700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2135800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0291, - "step": 2135900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2136000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2136100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2136200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 2136300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2136400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2136500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2136600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2136700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2136800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2136900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2137000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2137100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2137200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2137300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2137400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 2137500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 2137600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 2137700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2137800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 2137900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2138000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2138100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2138200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2138300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2138400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2138500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2138600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2138700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2138800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2138900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2139000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2139100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2139200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2139300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2139400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2139500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2139600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 2139700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2139800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2139900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2140000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0284576416015625, - "eval_runtime": 3176.7209, - "eval_samples_per_second": 354.052, - "eval_steps_per_second": 22.128, - "step": 2140000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2140100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2140200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2140300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2140400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2140500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2140600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2140700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2140800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2140900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2141000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2141100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2141200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2141300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2141400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2141500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0295, - "step": 2141600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2141700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2141800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2141900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2142000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2142100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2142200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2142300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2142400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2142500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2142600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2142700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2142800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 2142900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2143000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 2143100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2143200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2143300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2143400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2143500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2143600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 2143700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 2143800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2143900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2144000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2144100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2144200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2144300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2144400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2144500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2144600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2144700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2144800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2144900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2145000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2145100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2145200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2145300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2145400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2145500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2145600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2145700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2145800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2145900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2146000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2146100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2146200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2146300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2146400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2146500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2146600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2146700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2146800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2146900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2147000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2147100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2147200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2147300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 2147400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2147500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2147600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2147700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2147800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 2147900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 2148000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2148100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2148200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2148300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2148400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2148500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2148600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2148700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 2148800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2148900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2149000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2149100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2149200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2149300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2149400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2149500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2149600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2149700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2149800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2149900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2150000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2150100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2150200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2150300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2150400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2150500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2150600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2150700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 2150800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2150900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2151000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2151100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2151200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2151300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2151400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2151500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2151600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2151700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 2151800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2151900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2152000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 2152100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2152200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2152300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2152400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 2152500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2152600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 2152700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2152800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2152900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2153000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2153100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2153200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2153300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2153400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2153500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2153600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2153700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2153800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2153900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2154000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2154100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2154200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2154300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2154400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2154500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2154600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2154700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2154800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2154900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2155000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2155100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 2155200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2155300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2155400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2155500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2155600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2155700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2155800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2155900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2156000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2156100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2156200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2156300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2156400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2156500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2156600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2156700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2156800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2156900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 2157000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2157100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2157200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2157300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2157400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2157500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2157600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2157700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 2157800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2157900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2158000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2158100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2158200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2158300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2158400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2158500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2158600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2158700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2158800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2158900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2159000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2159100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2159200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2159300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2159400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2159500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2159600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2159700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2159800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2159900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2160000 - }, - { - "epoch": 0.0, - "eval_loss": 0.028472900390625, - "eval_runtime": 3202.5066, - "eval_samples_per_second": 351.201, - "eval_steps_per_second": 21.95, - "step": 2160000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2160100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2160200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2160300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2160400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2160500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2160600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2160700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2160800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2160900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2161000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2161100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2161200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2161300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2161400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2161500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2161600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2161700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2161800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2161900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2162000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 2162100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 2162200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0363, - "step": 2162300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 2162400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2162500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 2162600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 2162700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2162800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2162900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2163000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2163100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2163200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 2163300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 2163400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2163500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0418, - "step": 2163600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2163700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2163800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 2163900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2164000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2164100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2164200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2164300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2164400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2164500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2164600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2164700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 2164800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 2164900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2165000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 2165100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0413, - "step": 2165200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 2165300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2165400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 2165500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 2165600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2165700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 2165800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2165900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2166000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2166100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.038, - "step": 2166200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 2166300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2166400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2166500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2166600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 2166700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2166800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2166900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 2167000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2167100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2167200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2167300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2167400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2167500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2167600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 2167700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2167800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 2167900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2168000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2168100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2168200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2168300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2168400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2168500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 2168600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2168700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2168800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2168900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2169000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2169100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2169200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2169300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2169400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2169500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2169600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2169700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2169800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2169900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2170000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2170100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2170200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2170300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2170400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2170500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2170600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 2170700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0379, - "step": 2170800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 2170900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2171000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2171100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2171200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2171300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 2171400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2171500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2171600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2171700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2171800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 2171900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2172000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2172100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2172200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2172300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2172400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2172500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2172600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 2172700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 2172800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2172900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 2173000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2173100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 2173200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2173300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2173400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2173500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 2173600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2173700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2173800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2173900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 2174000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2174100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2174200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2174300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2174400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2174500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2174600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2174700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 2174800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2174900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2175000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2175100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2175200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2175300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2175400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2175500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2175600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2175700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2175800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 2175900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2176000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2176100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2176200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 2176300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2176400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2176500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2176600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2176700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2176800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2176900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 2177000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2177100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2177200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2177300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2177400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2177500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2177600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2177700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2177800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2177900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2178000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2178100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2178200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2178300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2178400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 2178500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2178600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2178700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2178800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2178900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2179000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2179100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2179200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2179300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2179400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2179500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2179600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2179700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2179800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2179900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 2180000 - }, - { - "epoch": 0.0, - "eval_loss": 0.028778076171875, - "eval_runtime": 3226.9277, - "eval_samples_per_second": 348.543, - "eval_steps_per_second": 21.784, - "step": 2180000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2180100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2180200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2180300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2180400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2180500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2180600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2180700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2180800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2180900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2181000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2181100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2181200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2181300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2181400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2181500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2181600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2181700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2181800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 2181900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2182000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2182100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2182200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2182300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2182400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 2182500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2182600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 2182700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 2182800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2182900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2183000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2183100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2183200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 2183300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 2183400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 2183500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 2183600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 2183700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 2183800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2183900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2184000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2184100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 2184200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2184300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2184400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 2184500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 2184600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2184700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2184800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2184900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2185000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2185100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2185200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 2185300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2185400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2185500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2185600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2185700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2185800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2185900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2186000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2186100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2186200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 2186300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 2186400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2186500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2186600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2186700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2186800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2186900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2187000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2187100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 2187200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2187300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2187400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2187500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2187600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2187700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2187800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2187900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2188000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2188100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2188200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2188300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2188400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2188500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2188600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2188700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2188800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 2188900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2189000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2189100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 2189200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2189300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2189400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2189500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 2189600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2189700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2189800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 2189900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2190000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 2190100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2190200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2190300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2190400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2190500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2190600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2190700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2190800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2190900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2191000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2191100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2191200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2191300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0519, - "step": 2191400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0475, - "step": 2191500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2191600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2191700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2191800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2191900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2192000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2192100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2192200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 2192300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 2192400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0358, - "step": 2192500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2192600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2192700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2192800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2192900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2193000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2193100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 2193200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2193300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2193400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2193500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2193600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 2193700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2193800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2193900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2194000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 2194100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0379, - "step": 2194200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 2194300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 2194400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2194500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2194600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2194700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2194800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2194900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2195000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2195100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2195200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2195300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2195400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2195500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2195600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2195700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2195800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2195900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2196000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2196100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2196200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2196300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2196400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2196500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2196600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2196700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2196800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2196900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2197000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2197100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2197200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2197300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2197400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2197500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2197600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 2197700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2197800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2197900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2198000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 2198100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 2198200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 2198300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2198400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 2198500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2198600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2198700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2198800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2198900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2199000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2199100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2199200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2199300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2199400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2199500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0295, - "step": 2199600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2199700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2199800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2199900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2200000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0284271240234375, - "eval_runtime": 2908.5149, - "eval_samples_per_second": 386.7, - "eval_steps_per_second": 24.169, - "step": 2200000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2200100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2200200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2200300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2200400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2200500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2200600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 2200700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2200800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2200900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2201000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2201100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2201200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 2201300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0369, - "step": 2201400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2201500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2201600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2201700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2201800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2201900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2202000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 2202100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2202200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2202300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2202400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2202500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2202600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2202700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2202800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.029, - "step": 2202900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2203000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2203100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2203200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2203300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2203400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2203500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2203600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2203700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2203800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2203900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2204000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2204100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2204200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2204300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2204400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2204500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2204600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2204700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2204800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2204900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2205000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2205100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2205200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2205300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2205400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2205500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2205600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2205700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2205800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2205900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2206000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2206100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2206200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 2206300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2206400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 2206500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2206600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2206700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 2206800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2206900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2207000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2207100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 2207200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2207300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2207400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2207500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2207600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2207700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2207800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2207900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2208000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2208100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2208200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2208300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2208400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2208500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2208600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2208700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2208800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2208900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2209000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2209100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2209200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2209300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2209400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2209500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2209600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2209700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2209800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2209900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2210000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2210100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2210200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0295, - "step": 2210300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2210400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2210500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2210600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2210700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2210800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2210900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2211000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2211100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2211200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2211300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0296, - "step": 2211400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2211500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2211600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2211700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2211800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2211900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2212000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 2212100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2212200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2212300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2212400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 2212500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2212600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2212700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2212800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2212900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2213000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2213100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2213200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2213300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2213400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2213500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2213600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2213700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2213800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2213900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 2214000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2214100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2214200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 2214300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2214400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2214500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2214600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2214700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2214800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2214900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 2215000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2215100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2215200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2215300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2215400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0296, - "step": 2215500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2215600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2215700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2215800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2215900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2216000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2216100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2216200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2216300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 2216400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2216500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 2216600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2216700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2216800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2216900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2217000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2217100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2217200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2217300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2217400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2217500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2217600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2217700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2217800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 2217900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2218000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2218100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2218200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2218300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2218400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2218500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2218600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2218700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2218800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2218900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2219000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2219100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2219200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2219300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2219400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2219500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2219600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2219700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2219800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2219900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2220000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0283355712890625, - "eval_runtime": 3375.4901, - "eval_samples_per_second": 333.203, - "eval_steps_per_second": 20.825, - "step": 2220000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2220100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2220200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2220300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2220400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0294, - "step": 2220500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2220600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2220700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2220800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2220900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2221000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2221100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2221200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 2221300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2221400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0292, - "step": 2221500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2221600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2221700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2221800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2221900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2222000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 2222100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2222200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2222300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2222400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2222500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2222600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2222700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2222800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2222900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2223000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2223100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2223200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2223300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2223400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2223500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 2223600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2223700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2223800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2223900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2224000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2224100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2224200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2224300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2224400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2224500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2224600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2224700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0292, - "step": 2224800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2224900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2225000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2225100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2225200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2225300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2225400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2225500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2225600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2225700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2225800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2225900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2226000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2226100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2226200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2226300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2226400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2226500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2226600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2226700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2226800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2226900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2227000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2227100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2227200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2227300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2227400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2227500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2227600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2227700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2227800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2227900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2228000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2228100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2228200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2228300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2228400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2228500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2228600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2228700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 2228800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2228900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2229000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2229100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2229200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2229300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2229400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2229500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2229600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2229700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2229800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 2229900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2230000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2230100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2230200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2230300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2230400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2230500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2230600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2230700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2230800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2230900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2231000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2231100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2231200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2231300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2231400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2231500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2231600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2231700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2231800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0294, - "step": 2231900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2232000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2232100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2232200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2232300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2232400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2232500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2232600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2232700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2232800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2232900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2233000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2233100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2233200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2233300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2233400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2233500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2233600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2233700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2233800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2233900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2234000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2234100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2234200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2234300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 2234400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2234500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2234600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2234700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2234800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2234900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 2235000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2235100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2235200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2235300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2235400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2235500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2235600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2235700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2235800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2235900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2236000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2236100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2236200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2236300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2236400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2236500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0291, - "step": 2236600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2236700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2236800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2236900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2237000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2237100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2237200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2237300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2237400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2237500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2237600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2237700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2237800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2237900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 2238000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2238100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2238200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2238300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2238400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2238500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 2238600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2238700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2238800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2238900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2239000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2239100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2239200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2239300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 2239400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 2239500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2239600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2239700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 2239800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2239900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2240000 - }, - { - "epoch": 0.0, - "eval_loss": 0.02838134765625, - "eval_runtime": 3108.7086, - "eval_samples_per_second": 361.797, - "eval_steps_per_second": 22.613, - "step": 2240000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2240100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2240200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2240300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2240400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2240500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2240600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2240700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2240800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2240900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 2241000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2241100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2241200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2241300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2241400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2241500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 2241600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2241700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2241800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2241900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2242000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2242100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0294, - "step": 2242200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2242300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 2242400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2242500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0755, - "step": 2242600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 2242700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2242800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2242900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2243000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2243100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2243200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2243300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2243400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2243500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2243600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2243700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2243800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2243900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2244000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2244100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2244200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2244300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2244400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0296, - "step": 2244500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2244600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2244700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2244800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0573, - "step": 2244900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 2245000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2245100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 2245200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2245300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2245400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2245500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 2245600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2245700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2245800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2245900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2246000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2246100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2246200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2246300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2246400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2246500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2246600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2246700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2246800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2246900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2247000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2247100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 2247200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2247300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2247400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2247500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2247600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2247700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2247800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2247900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2248000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2248100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2248200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2248300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2248400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2248500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2248600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2248700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2248800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2248900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2249000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2249100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2249200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2249300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2249400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2249500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2249600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2249700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2249800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2249900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2250000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2250100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2250200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2250300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2250400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2250500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2250600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2250700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 2250800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2250900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2251000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 2251100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2251200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2251300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2251400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2251500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 2251600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2251700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2251800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2251900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2252000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2252100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2252200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2252300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2252400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2252500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2252600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2252700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2252800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2252900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0292, - "step": 2253000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2253100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2253200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2253300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2253400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 2253500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2253600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2253700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2253800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2253900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2254000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2254100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2254200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2254300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2254400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2254500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2254600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2254700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2254800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2254900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2255000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2255100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2255200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2255300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2255400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2255500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2255600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2255700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2255800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2255900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 2256000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2256100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2256200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2256300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2256400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 2256500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2256600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2256700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2256800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2256900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2257000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2257100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2257200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2257300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 2257400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2257500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2257600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2257700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2257800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2257900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2258000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2258100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2258200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2258300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2258400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2258500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2258600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2258700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0295, - "step": 2258800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2258900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2259000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2259100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2259200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2259300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2259400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2259500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2259600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2259700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2259800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 2259900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2260000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0285491943359375, - "eval_runtime": 3338.9852, - "eval_samples_per_second": 336.846, - "eval_steps_per_second": 21.053, - "step": 2260000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 2260100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2260200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 2260300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2260400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2260500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2260600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 2260700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2260800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2260900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2261000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2261100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2261200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2261300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2261400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2261500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2261600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2261700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2261800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2261900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2262000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2262100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2262200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2262300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2262400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2262500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2262600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2262700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2262800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2262900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2263000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2263100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2263200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2263300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2263400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 2263500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2263600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2263700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2263800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2263900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2264000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2264100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 2264200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2264300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2264400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2264500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2264600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2264700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2264800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2264900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2265000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2265100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2265200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2265300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2265400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2265500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2265600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2265700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2265800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2265900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2266000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2266100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2266200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2266300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2266400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2266500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2266600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2266700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2266800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2266900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2267000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2267100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2267200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2267300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2267400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2267500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2267600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2267700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2267800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2267900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2268000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2268100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2268200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2268300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.029, - "step": 2268400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2268500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2268600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2268700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2268800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2268900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2269000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2269100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2269200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2269300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2269400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 2269500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2269600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2269700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2269800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2269900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2270000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2270100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2270200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2270300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2270400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2270500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2270600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2270700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2270800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2270900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2271000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2271100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2271200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2271300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2271400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2271500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 2271600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2271700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2271800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2271900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2272000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2272100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2272200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2272300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2272400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2272500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2272600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2272700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2272800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2272900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2273000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2273100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2273200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2273300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2273400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2273500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2273600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2273700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2273800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2273900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2274000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0294, - "step": 2274100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2274200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0383, - "step": 2274300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0391, - "step": 2274400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2274500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2274600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2274700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2274800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2274900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 2275000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 2275100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2275200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 2275300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2275400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2275500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2275600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2275700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2275800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0292, - "step": 2275900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2276000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2276100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2276200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2276300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2276400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0294, - "step": 2276500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2276600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2276700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2276800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2276900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0386, - "step": 2277000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 2277100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2277200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2277300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2277400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2277500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2277600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2277700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 2277800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2277900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2278000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2278100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2278200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2278300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2278400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2278500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 2278600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2278700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2278800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2278900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2279000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2279100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 2279200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2279300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2279400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2279500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2279600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2279700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2279800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2279900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2280000 - }, - { - "epoch": 0.0, - "eval_loss": 0.028411865234375, - "eval_runtime": 3129.8443, - "eval_samples_per_second": 359.354, - "eval_steps_per_second": 22.46, - "step": 2280000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2280100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2280200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2280300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2280400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2280500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2280600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2280700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 2280800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2280900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2281000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2281100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2281200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2281300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2281400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2281500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2281600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2281700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2281800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2281900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2282000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2282100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 2282200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2282300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2282400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2282500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 2282600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2282700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2282800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2282900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2283000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2283100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2283200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2283300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2283400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2283500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2283600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2283700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2283800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2283900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2284000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2284100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2284200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2284300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2284400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2284500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0294, - "step": 2284600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2284700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2284800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2284900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2285000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2285100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2285200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 2285300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2285400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2285500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2285600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2285700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2285800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2285900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2286000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2286100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2286200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0294, - "step": 2286300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2286400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2286500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2286600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2286700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2286800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2286900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2287000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0292, - "step": 2287100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2287200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0287, - "step": 2287300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2287400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2287500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2287600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2287700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2287800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2287900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.038, - "step": 2288000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2288100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2288200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2288300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 2288400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2288500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2288600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2288700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2288800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2288900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2289000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2289100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2289200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2289300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2289400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2289500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2289600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2289700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2289800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 2289900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2290000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 2290100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2290200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2290300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2290400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2290500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2290600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2290700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2290800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2290900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2291000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2291100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2291200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2291300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2291400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 2291500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2291600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2291700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2291800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2291900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2292000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 2292100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2292200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2292300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2292400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2292500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2292600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2292700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2292800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 2292900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2293000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2293100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2293200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2293300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2293400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2293500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 2293600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2293700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2293800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2293900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2294000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 2294100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2294200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2294300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2294400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2294500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2294600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 2294700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2294800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2294900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2295000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2295100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2295200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2295300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2295400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2295500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2295600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2295700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2295800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2295900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2296000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2296100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0292, - "step": 2296200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2296300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2296400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2296500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2296600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2296700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2296800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2296900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2297000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2297100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 2297200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2297300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2297400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2297500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2297600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2297700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2297800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2297900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2298000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2298100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2298200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2298300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2298400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2298500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2298600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 2298700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2298800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2298900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2299000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2299100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2299200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2299300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2299400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2299500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2299600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2299700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2299800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2299900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2300000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0286712646484375, - "eval_runtime": 3485.3801, - "eval_samples_per_second": 322.697, - "eval_steps_per_second": 20.169, - "step": 2300000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2300100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2300200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2300300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2300400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2300500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2300600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2300700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2300800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2300900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2301000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2301100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2301200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2301300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2301400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2301500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2301600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0295, - "step": 2301700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2301800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2301900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2302000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2302100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2302200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 2302300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2302400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2302500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2302600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2302700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2302800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2302900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2303000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 2303100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2303200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2303300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2303400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2303500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2303600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2303700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2303800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2303900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2304000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2304100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 2304200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2304300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2304400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2304500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2304600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2304700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0296, - "step": 2304800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2304900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2305000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2305100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2305200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2305300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2305400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2305500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2305600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2305700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2305800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2305900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2306000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2306100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2306200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2306300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2306400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2306500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2306600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2306700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2306800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2306900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2307000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2307100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0444, - "step": 2307200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2307300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2307400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 2307500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2307600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2307700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2307800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2307900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2308000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2308100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2308200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2308300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2308400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2308500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2308600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2308700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2308800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2308900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2309000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2309100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2309200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0291, - "step": 2309300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2309400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0291, - "step": 2309500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2309600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2309700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2309800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2309900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2310000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2310100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2310200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2310300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 2310400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2310500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 2310600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2310700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2310800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2310900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2311000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2311100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2311200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2311300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2311400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2311500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2311600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2311700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2311800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 2311900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2312000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 2312100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 2312200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2312300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2312400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2312500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2312600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2312700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2312800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2312900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2313000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2313100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2313200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2313300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2313400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2313500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2313600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2313700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2313800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2313900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2314000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 2314100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2314200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2314300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2314400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2314500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2314600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2314700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2314800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0296, - "step": 2314900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2315000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2315100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2315200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2315300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2315400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0295, - "step": 2315500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2315600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2315700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2315800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2315900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2316000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2316100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2316200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2316300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2316400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2316500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2316600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2316700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2316800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2316900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2317000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2317100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2317200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2317300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2317400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2317500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2317600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2317700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2317800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2317900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2318000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2318100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2318200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2318300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2318400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2318500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2318600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2318700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 2318800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2318900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2319000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2319100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2319200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2319300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 2319400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 2319500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2319600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2319700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2319800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2319900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2320000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0284271240234375, - "eval_runtime": 3334.3289, - "eval_samples_per_second": 337.316, - "eval_steps_per_second": 21.083, - "step": 2320000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2320100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2320200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2320300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2320400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2320500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2320600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 2320700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2320800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2320900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2321000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2321100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2321200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2321300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 2321400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2321500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2321600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2321700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0292, - "step": 2321800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2321900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2322000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2322100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2322200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 2322300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2322400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2322500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2322600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2322700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2322800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2322900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 2323000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2323100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2323200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2323300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 2323400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0294, - "step": 2323500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2323600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2323700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0295, - "step": 2323800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2323900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2324000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2324100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 2324200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2324300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2324400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2324500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2324600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 2324700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2324800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2324900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2325000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2325100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2325200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2325300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2325400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2325500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2325600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2325700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2325800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2325900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 2326000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 2326100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 2326200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2326300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2326400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2326500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2326600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2326700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 2326800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2326900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2327000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2327100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2327200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 2327300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2327400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2327500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2327600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 2327700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2327800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2327900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 2328000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2328100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2328200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2328300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2328400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 2328500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2328600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2328700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2328800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2328900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 2329000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2329100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2329200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2329300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2329400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2329500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2329600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2329700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2329800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2329900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2330000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2330100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2330200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 2330300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2330400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2330500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.029, - "step": 2330600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 2330700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2330800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2330900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2331000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2331100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2331200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2331300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2331400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2331500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2331600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2331700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0293, - "step": 2331800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2331900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2332000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2332100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2332200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2332300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2332400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2332500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2332600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2332700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2332800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2332900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2333000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2333100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2333200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2333300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2333400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2333500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2333600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2333700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2333800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2333900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2334000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2334100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2334200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2334300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2334400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2334500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 2334600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2334700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2334800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0293, - "step": 2334900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2335000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2335100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2335200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2335300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2335400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2335500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2335600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2335700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2335800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2335900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2336000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 2336100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2336200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2336300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2336400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2336500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 2336600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2336700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2336800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2336900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2337000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0293, - "step": 2337100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2337200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2337300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2337400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2337500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0286, - "step": 2337600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2337700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2337800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2337900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2338000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2338100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2338200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2338300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2338400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2338500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2338600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2338700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2338800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2338900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2339000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2339100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2339200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2339300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2339400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2339500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2339600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2339700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2339800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2339900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 2340000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0283966064453125, - "eval_runtime": 3291.434, - "eval_samples_per_second": 341.712, - "eval_steps_per_second": 21.357, - "step": 2340000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2340100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2340200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2340300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 2340400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2340500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2340600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2340700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2340800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2340900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 2341000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2341100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2341200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2341300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2341400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2341500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2341600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2341700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2341800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2341900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2342000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 2342100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2342200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2342300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2342400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2342500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2342600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 2342700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2342800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2342900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2343000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2343100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2343200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2343300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2343400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2343500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2343600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2343700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2343800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2343900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0365, - "step": 2344000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2344100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2344200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2344300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2344400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2344500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2344600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2344700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 2344800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2344900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2345000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2345100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2345200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2345300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2345400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2345500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2345600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 2345700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2345800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2345900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2346000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2346100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2346200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2346300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2346400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2346500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2346600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2346700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2346800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2346900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2347000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2347100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2347200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2347300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2347400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2347500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2347600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0295, - "step": 2347700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2347800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2347900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2348000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2348100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2348200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2348300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2348400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2348500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2348600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2348700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0295, - "step": 2348800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.029, - "step": 2348900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2349000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2349100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2349200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2349300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2349400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2349500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2349600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2349700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2349800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2349900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2350000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2350100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2350200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2350300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2350400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2350500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2350600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2350700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2350800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2350900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2351000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2351100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2351200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 2351300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2351400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2351500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2351600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2351700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2351800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2351900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2352000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2352100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2352200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2352300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 2352400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2352500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2352600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2352700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2352800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2352900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 2353000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2353100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2353200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2353300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2353400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2353500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2353600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2353700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2353800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2353900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2354000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0295, - "step": 2354100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2354200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2354300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2354400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2354500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2354600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2354700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2354800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2354900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2355000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2355100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2355200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0791, - "step": 2355300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2355400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2355500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0294, - "step": 2355600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2355700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2355800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2355900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0293, - "step": 2356000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2356100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2356200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2356300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2356400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2356500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2356600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2356700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2356800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2356900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2357000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2357100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2357200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2357300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2357400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2357500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2357600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2357700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0293, - "step": 2357800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2357900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2358000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2358100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2358200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2358300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2358400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2358500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 2358600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2358700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0295, - "step": 2358800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2358900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 2359000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2359100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2359200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2359300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2359400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2359500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2359600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2359700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2359800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2359900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 2360000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0284271240234375, - "eval_runtime": 3552.3735, - "eval_samples_per_second": 316.612, - "eval_steps_per_second": 19.788, - "step": 2360000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2360100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2360200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2360300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2360400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2360500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2360600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2360700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2360800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2360900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2361000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2361100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2361200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2361300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2361400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2361500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2361600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2361700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2361800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2361900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2362000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2362100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0292, - "step": 2362200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2362300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2362400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2362500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2362600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2362700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 2362800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2362900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2363000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2363100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2363200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2363300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2363400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2363500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2363600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2363700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 2363800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2363900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2364000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2364100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2364200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2364300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2364400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2364500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2364600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2364700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2364800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2364900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2365000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2365100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2365200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 2365300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0374, - "step": 2365400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2365500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2365600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 2365700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2365800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2365900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2366000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2366100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2366200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2366300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2366400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2366500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2366600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2366700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2366800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2366900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2367000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0294, - "step": 2367100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2367200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2367300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2367400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2367500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2367600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2367700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 2367800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2367900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2368000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2368100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2368200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0293, - "step": 2368300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2368400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 2368500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2368600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 2368700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2368800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2368900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2369000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2369100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2369200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2369300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2369400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2369500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2369600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 2369700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2369800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 2369900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2370000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2370100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2370200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2370300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2370400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2370500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2370600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2370700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2370800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2370900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2371000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2371100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2371200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2371300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2371400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0293, - "step": 2371500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2371600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2371700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2371800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2371900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2372000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2372100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2372200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 2372300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2372400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 2372500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2372600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2372700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2372800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2372900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2373000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2373100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2373200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2373300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2373400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2373500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2373600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2373700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2373800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2373900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2374000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2374100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2374200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 2374300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2374400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2374500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2374600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2374700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2374800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 2374900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 2375000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 2375100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0356, - "step": 2375200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2375300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2375400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2375500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2375600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2375700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2375800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2375900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2376000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2376100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 2376200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2376300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2376400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2376500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2376600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2376700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2376800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2376900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2377000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2377100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2377200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2377300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2377400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2377500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2377600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2377700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2377800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2377900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2378000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2378100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2378200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2378300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2378400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2378500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2378600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2378700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2378800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 2378900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2379000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2379100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2379200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2379300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2379400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2379500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2379600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 2379700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2379800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2379900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2380000 - }, - { - "epoch": 0.0, - "eval_loss": 0.028656005859375, - "eval_runtime": 3360.1297, - "eval_samples_per_second": 334.726, - "eval_steps_per_second": 20.921, - "step": 2380000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2380100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 2380200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2380300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2380400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2380500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2380600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 2380700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2380800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2380900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2381000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2381100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2381200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2381300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2381400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2381500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2381600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2381700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2381800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2381900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2382000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0296, - "step": 2382100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2382200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2382300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2382400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2382500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2382600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2382700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2382800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2382900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2383000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2383100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2383200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2383300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2383400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 2383500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2383600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2383700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2383800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2383900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2384000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2384100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2384200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2384300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2384400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2384500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2384600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2384700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2384800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2384900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2385000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2385100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2385200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2385300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2385400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2385500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2385600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2385700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2385800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2385900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2386000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2386100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2386200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 2386300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2386400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2386500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 2386600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2386700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2386800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2386900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2387000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2387100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2387200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2387300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2387400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2387500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2387600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2387700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0296, - "step": 2387800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2387900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2388000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2388100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2388200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2388300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2388400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 2388500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2388600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2388700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2388800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0291, - "step": 2388900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2389000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2389100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2389200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2389300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2389400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2389500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 2389600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2389700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2389800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2389900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2390000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2390100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2390200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2390300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0294, - "step": 2390400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2390500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2390600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2390700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2390800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0294, - "step": 2390900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2391000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2391100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2391200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2391300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2391400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2391500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2391600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2391700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2391800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2391900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2392000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2392100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2392200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2392300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2392400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2392500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2392600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2392700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2392800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2392900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2393000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2393100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2393200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2393300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2393400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2393500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2393600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 2393700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2393800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2393900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2394000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2394100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2394200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2394300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 2394400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2394500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2394600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2394700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2394800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2394900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2395000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2395100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2395200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2395300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2395400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2395500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0293, - "step": 2395600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2395700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2395800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2395900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2396000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2396100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2396200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2396300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 2396400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2396500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2396600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 2396700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2396800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 2396900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2397000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2397100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2397200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2397300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2397400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2397500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2397600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2397700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2397800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2397900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2398000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2398100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2398200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2398300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2398400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2398500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2398600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2398700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2398800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2398900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2399000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2399100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 2399200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2399300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2399400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2399500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2399600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2399700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2399800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2399900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2400000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0284881591796875, - "eval_runtime": 3271.9298, - "eval_samples_per_second": 343.749, - "eval_steps_per_second": 21.485, - "step": 2400000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2400100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2400200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2400300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2400400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2400500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2400600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0293, - "step": 2400700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2400800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2400900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2401000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2401100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2401200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2401300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2401400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2401500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2401600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2401700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2401800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2401900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2402000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2402100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2402200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2402300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2402400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2402500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2402600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2402700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2402800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2402900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2403000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2403100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2403200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2403300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2403400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2403500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2403600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2403700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2403800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2403900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0287, - "step": 2404000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2404100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2404200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2404300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2404400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2404500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2404600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2404700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2404800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2404900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2405000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2405100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2405200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2405300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2405400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2405500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2405600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2405700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2405800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2405900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2406000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2406100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2406200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2406300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2406400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2406500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2406600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2406700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2406800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2406900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2407000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2407100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2407200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2407300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2407400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2407500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2407600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2407700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0292, - "step": 2407800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2407900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2408000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2408100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2408200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2408300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2408400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2408500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2408600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2408700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2408800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2408900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2409000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2409100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2409200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2409300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2409400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2409500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2409600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2409700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2409800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 2409900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2410000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2410100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2410200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2410300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2410400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 2410500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2410600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 2410700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2410800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2410900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2411000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2411100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2411200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0293, - "step": 2411300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2411400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 2411500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2411600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2411700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2411800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2411900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2412000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2412100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2412200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2412300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2412400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2412500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2412600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2412700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2412800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2412900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2413000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2413100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2413200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2413300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2413400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2413500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0292, - "step": 2413600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2413700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2413800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2413900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2414000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2414100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2414200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2414300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2414400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2414500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0295, - "step": 2414600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2414700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2414800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2414900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2415000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2415100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2415200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2415300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2415400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2415500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2415600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 2415700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2415800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2415900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2416000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2416100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2416200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2416300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2416400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0293, - "step": 2416500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2416600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0296, - "step": 2416700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2416800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0292, - "step": 2416900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2417000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2417100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2417200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0294, - "step": 2417300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2417400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2417500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2417600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2417700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2417800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2417900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2418000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2418100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2418200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2418300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2418400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2418500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2418600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2418700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2418800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2418900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2419000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2419100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2419200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2419300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2419400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2419500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2419600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2419700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2419800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2419900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2420000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0282440185546875, - "eval_runtime": 3824.0528, - "eval_samples_per_second": 294.118, - "eval_steps_per_second": 18.383, - "step": 2420000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2420100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2420200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2420300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2420400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2420500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2420600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2420700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2420800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2420900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2421000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2421100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2421200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2421300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2421400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2421500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2421600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2421700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2421800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2421900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2422000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2422100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2422200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2422300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2422400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2422500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2422600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2422700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2422800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2422900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2423000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2423100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2423200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2423300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2423400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2423500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2423600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2423700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2423800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2423900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2424000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2424100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2424200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2424300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2424400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2424500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2424600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2424700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2424800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2424900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2425000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2425100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2425200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2425300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2425400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2425500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2425600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2425700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2425800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2425900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2426000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 2426100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0295, - "step": 2426200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2426300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2426400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2426500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2426600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2426700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2426800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2426900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2427000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2427100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2427200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 2427300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0294, - "step": 2427400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2427500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2427600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2427700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2427800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0289, - "step": 2427900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2428000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2428100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2428200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2428300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2428400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2428500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 2428600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2428700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2428800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2428900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2429000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2429100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2429200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2429300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2429400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2429500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2429600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2429700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2429800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2429900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2430000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2430100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 2430200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2430300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0296, - "step": 2430400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2430500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2430600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2430700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2430800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 2430900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2431000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2431100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2431200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2431300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2431400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2431500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2431600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2431700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 2431800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 2431900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2432000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 2432100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2432200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2432300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2432400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2432500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2432600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2432700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2432800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2432900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2433000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2433100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2433200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2433300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2433400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2433500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2433600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2433700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2433800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2433900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2434000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2434100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2434200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2434300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2434400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2434500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2434600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2434700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2434800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2434900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2435000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2435100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2435200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2435300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2435400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2435500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2435600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2435700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2435800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2435900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2436000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2436100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2436200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2436300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0296, - "step": 2436400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2436500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 2436600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2436700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2436800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2436900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2437000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 2437100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2437200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2437300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2437400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2437500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2437600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2437700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2437800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.029, - "step": 2437900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 2438000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2438100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2438200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2438300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0288, - "step": 2438400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2438500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2438600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2438700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2438800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 2438900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2439000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2439100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2439200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2439300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2439400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2439500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0294, - "step": 2439600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2439700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2439800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2439900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2440000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0283966064453125, - "eval_runtime": 3407.4332, - "eval_samples_per_second": 330.079, - "eval_steps_per_second": 20.63, - "step": 2440000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2440100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2440200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2440300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 2440400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2440500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0294, - "step": 2440600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2440700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2440800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2440900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2441000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2441100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2441200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2441300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2441400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2441500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 2441600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2441700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2441800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0296, - "step": 2441900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2442000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2442100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2442200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2442300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2442400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2442500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 2442600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2442700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2442800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2442900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2443000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2443100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2443200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2443300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2443400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2443500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2443600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2443700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2443800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2443900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2444000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2444100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2444200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2444300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2444400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2444500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2444600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 2444700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2444800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2444900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2445000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2445100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 2445200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2445300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0294, - "step": 2445400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2445500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2445600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 2445700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2445800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2445900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2446000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2446100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2446200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2446300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2446400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2446500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2446600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2446700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2446800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2446900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 2447000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2447100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2447200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2447300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2447400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2447500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2447600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2447700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2447800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2447900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2448000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2448100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2448200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2448300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2448400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2448500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2448600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2448700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2448800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2448900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2449000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2449100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2449200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2449300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2449400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0296, - "step": 2449500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2449600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2449700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0291, - "step": 2449800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0292, - "step": 2449900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2450000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2450100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0293, - "step": 2450200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2450300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2450400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2450500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2450600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2450700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2450800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2450900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2451000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2451100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 2451200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2451300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2451400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2451500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2451600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2451700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2451800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2451900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 2452000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2452100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2452200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2452300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2452400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2452500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2452600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2452700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2452800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2452900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2453000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2453100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2453200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2453300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0286, - "step": 2453400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2453500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2453600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2453700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2453800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2453900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2454000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 2454100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2454200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2454300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2454400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2454500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 2454600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2454700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2454800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2454900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2455000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2455100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2455200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2455300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2455400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2455500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2455600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2455700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2455800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2455900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2456000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2456100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2456200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2456300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2456400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2456500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2456600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2456700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 2456800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 2456900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2457000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2457100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2457200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2457300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2457400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2457500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2457600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2457700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2457800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2457900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2458000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0767, - "step": 2458100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0451, - "step": 2458200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 2458300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 2458400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2458500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2458600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 2458700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2458800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2458900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2459000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2459100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2459200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2459300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 2459400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0405, - "step": 2459500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2459600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2459700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2459800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 2459900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0481, - "step": 2460000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0302581787109375, - "eval_runtime": 3388.8848, - "eval_samples_per_second": 331.886, - "eval_steps_per_second": 20.743, - "step": 2460000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 2460100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2460200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2460300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2460400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 2460500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2460600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2460700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2460800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2460900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0296, - "step": 2461000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 2461100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 2461200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2461300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2461400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0287, - "step": 2461500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2461600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2461700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2461800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2461900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2462000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2462100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2462200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2462300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2462400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 2462500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2462600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2462700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 2462800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2462900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 2463000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2463100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2463200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2463300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2463400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 2463500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2463600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2463700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 2463800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2463900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0364, - "step": 2464000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2464100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 2464200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2464300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2464400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2464500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 2464600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2464700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 2464800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2464900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2465000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2465100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2465200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2465300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2465400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2465500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2465600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2465700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2465800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0372, - "step": 2465900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 2466000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2466100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 2466200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2466300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2466400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2466500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2466600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2466700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2466800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2466900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2467000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2467100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2467200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2467300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2467400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2467500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2467600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2467700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2467800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2467900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2468000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2468100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2468200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2468300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2468400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2468500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2468600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2468700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 2468800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2468900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 2469000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2469100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2469200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2469300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2469400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2469500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2469600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2469700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2469800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2469900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2470000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2470100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2470200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2470300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2470400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2470500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2470600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2470700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2470800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2470900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2471000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2471100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0293, - "step": 2471200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2471300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2471400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2471500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2471600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2471700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 2471800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 2471900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2472000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 2472100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 2472200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 2472300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2472400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 2472500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 2472600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2472700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2472800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2472900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2473000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 2473100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 2473200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2473300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2473400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2473500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2473600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2473700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2473800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0293, - "step": 2473900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2474000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2474100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2474200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2474300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2474400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2474500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2474600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2474700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2474800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2474900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2475000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2475100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 2475200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2475300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2475400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0593, - "step": 2475500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 2475600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2475700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2475800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2475900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2476000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2476100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2476200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2476300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2476400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2476500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2476600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2476700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2476800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2476900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2477000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2477100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2477200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2477300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 2477400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2477500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2477600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2477700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2477800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2477900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2478000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2478100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2478200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2478300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 2478400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2478500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2478600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2478700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2478800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2478900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2479000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2479100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2479200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2479300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2479400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0436, - "step": 2479500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2479600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2479700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2479800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2479900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2480000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0283966064453125, - "eval_runtime": 3390.2297, - "eval_samples_per_second": 331.754, - "eval_steps_per_second": 20.735, - "step": 2480000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2480100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2480200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2480300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 2480400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0386, - "step": 2480500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0418, - "step": 2480600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0454, - "step": 2480700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0368, - "step": 2480800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 2480900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 2481000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2481100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2481200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2481300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2481400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2481500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2481600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2481700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2481800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2481900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2482000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2482100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2482200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2482300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2482400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2482500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2482600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2482700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2482800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2482900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2483000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 2483100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2483200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2483300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2483400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2483500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2483600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2483700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2483800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2483900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2484000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2484100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2484200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2484300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2484400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2484500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2484600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2484700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0292, - "step": 2484800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2484900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2485000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2485100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2485200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2485300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2485400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2485500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 2485600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2485700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2485800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2485900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2486000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2486100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2486200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2486300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2486400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2486500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0289, - "step": 2486600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2486700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2486800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 2486900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2487000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2487100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2487200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 2487300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 2487400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2487500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2487600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2487700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2487800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2487900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2488000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2488100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 2488200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 2488300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2488400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2488500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 2488600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2488700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2488800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2488900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2489000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2489100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2489200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2489300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 2489400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2489500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2489600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0296, - "step": 2489700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2489800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 2489900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2490000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2490100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2490200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0294, - "step": 2490300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2490400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2490500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2490600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2490700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2490800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2490900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2491000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0401, - "step": 2491100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 2491200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 2491300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 2491400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 2491500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 2491600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 2491700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2491800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 2491900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2492000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2492100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2492200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 2492300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2492400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2492500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 2492600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2492700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 2492800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2492900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2493000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2493100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2493200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2493300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2493400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2493500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2493600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2493700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2493800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 2493900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 2494000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0405, - "step": 2494100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1806, - "step": 2494200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2494300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2494400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2494500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2494600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2494700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2494800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 2494900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2495000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2495100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 2495200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2495300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 2495400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 2495500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 2495600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2495700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2495800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2495900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2496000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2496100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2496200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2496300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2496400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2496500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 2496600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2496700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2496800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 2496900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2497000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2497100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2497200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2497300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 2497400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2497500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 2497600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2497700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2497800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2497900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2498000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2498100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2498200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2498300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2498400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2498500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2498600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 2498700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2498800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2498900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2499000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2499100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2499200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2499300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2499400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2499500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2499600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2499700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2499800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2499900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 2500000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0286407470703125, - "eval_runtime": 3397.6229, - "eval_samples_per_second": 331.032, - "eval_steps_per_second": 20.69, - "step": 2500000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2500100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2500200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2500300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2500400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2500500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 2500600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2500700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2500800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2500900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2501000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2501100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2501200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2501300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2501400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2501500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0294, - "step": 2501600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2501700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2501800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2501900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 2502000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0295, - "step": 2502100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2502200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 2502300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2502400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2502500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2502600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2502700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2502800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2502900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2503000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2503100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2503200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0295, - "step": 2503300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2503400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2503500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2503600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2503700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 2503800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2503900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2504000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2504100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2504200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2504300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2504400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2504500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2504600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2504700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2504800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2504900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2505000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2505100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2505200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2505300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2505400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2505500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2505600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2505700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2505800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 2505900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2506000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2506100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2506200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2506300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2506400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2506500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2506600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2506700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2506800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2506900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2507000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 2507100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2507200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2507300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2507400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 2507500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2507600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2507700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2507800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2507900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2508000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2508100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2508200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0294, - "step": 2508300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2508400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2508500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 2508600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2508700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 2508800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2508900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2509000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2509100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 2509200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2509300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2509400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2509500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2509600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2509700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2509800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2509900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2510000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2510100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2510200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2510300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2510400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2510500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2510600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 2510700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2510800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2510900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0291, - "step": 2511000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2511100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2511200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0296, - "step": 2511300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2511400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2511500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2511600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2511700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2511800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2511900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2512000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2512100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2512200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0285, - "step": 2512300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2512400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2512500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2512600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2512700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2512800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2512900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2513000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2513100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2513200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2513300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2513400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2513500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2513600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2513700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2513800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2513900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2514000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2514100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2514200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2514300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 2514400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2514500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 2514600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 2514700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2514800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2514900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 2515000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0531, - "step": 2515100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2515200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2515300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2515400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2515500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2515600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2515700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 2515800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 2515900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2516000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2516100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2516200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2516300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2516400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2516500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2516600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2516700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2516800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2516900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2517000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 2517100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2517200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2517300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2517400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2517500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2517600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2517700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2517800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2517900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2518000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2518100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2518200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2518300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2518400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2518500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2518600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2518700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2518800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2518900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2519000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2519100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2519200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2519300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2519400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2519500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2519600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 2519700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2519800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2519900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2520000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0289154052734375, - "eval_runtime": 3357.6743, - "eval_samples_per_second": 334.971, - "eval_steps_per_second": 20.936, - "step": 2520000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 2520100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2520200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2520300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2520400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2520500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 2520600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2520700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2520800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2520900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2521000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2521100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2521200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 2521300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 2521400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 2521500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2521600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 2521700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2521800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2521900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2522000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2522100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2522200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2522300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2522400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2522500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2522600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2522700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2522800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 2522900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2523000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2523100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2523200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2523300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2523400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2523500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2523600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 2523700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2523800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2523900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2524000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2524100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2524200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2524300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2524400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2524500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2524600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2524700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2524800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2524900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2525000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 2525100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2525200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2525300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0736, - "step": 2525400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2525500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2525600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 2525700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2525800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2525900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 2526000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2526100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2526200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2526300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2526400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2526500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2526600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2526700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2526800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2526900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2527000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2527100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 2527200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2527300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 2527400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2527500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2527600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2527700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2527800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2527900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2528000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2528100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2528200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0294, - "step": 2528300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2528400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2528500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2528600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2528700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2528800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2528900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2529000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2529100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2529200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2529300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2529400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2529500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2529600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2529700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2529800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2529900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2530000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2530100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2530200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2530300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2530400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2530500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2530600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 2530700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2530800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2530900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2531000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2531100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2531200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2531300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0293, - "step": 2531400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2531500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2531600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 2531700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2531800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2531900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2532000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2532100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2532200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2532300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2532400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2532500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2532600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2532700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2532800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2532900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2533000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2533100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2533200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2533300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 2533400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2533500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2533600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2533700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0294, - "step": 2533800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2533900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2534000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2534100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2534200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2534300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2534400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2534500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2534600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2534700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2534800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2534900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2535000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2535100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2535200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2535300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2535400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2535500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2535600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2535700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2535800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2535900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2536000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2536100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2536200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2536300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2536400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 2536500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 2536600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2536700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2536800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2536900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2537000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2537100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2537200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2537300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2537400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2537500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2537600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2537700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2537800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2537900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2538000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2538100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0418, - "step": 2538200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 2538300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2538400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2538500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2538600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2538700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2538800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2538900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2539000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2539100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2539200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2539300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 2539400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2539500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2539600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2539700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2539800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2539900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2540000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0288848876953125, - "eval_runtime": 3563.5068, - "eval_samples_per_second": 315.623, - "eval_steps_per_second": 19.727, - "step": 2540000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2540100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2540200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2540300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2540400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2540500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 2540600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2540700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2540800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2540900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 2541000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2541100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0295, - "step": 2541200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2541300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2541400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2541500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2541600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2541700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2541800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2541900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2542000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0352, - "step": 2542100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2542200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2542300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2542400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2542500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2542600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2542700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2542800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2542900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2543000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2543100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2543200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2543300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2543400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 2543500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2543600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2543700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0296, - "step": 2543800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2543900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2544000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2544100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2544200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2544300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2544400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2544500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2544600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2544700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2544800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0295, - "step": 2544900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2545000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2545100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2545200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2545300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2545400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2545500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2545600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2545700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2545800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0295, - "step": 2545900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2546000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2546100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2546200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2546300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2546400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2546500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2546600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2546700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2546800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2546900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2547000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2547100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2547200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2547300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2547400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 2547500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2547600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2547700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2547800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2547900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2548000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2548100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2548200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2548300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2548400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2548500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2548600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2548700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 2548800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2548900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2549000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2549100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2549200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2549300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2549400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 2549500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2549600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2549700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2549800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2549900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2550000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2550100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2550200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2550300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2550400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2550500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2550600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2550700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2550800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2550900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2551000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2551100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2551200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2551300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2551400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2551500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2551600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2551700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 2551800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 2551900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 2552000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2552100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2552200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 2552300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 2552400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2552500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2552600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2552700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2552800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2552900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2553000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2553100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2553200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0293, - "step": 2553300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2553400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2553500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2553600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2553700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 2553800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2553900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2554000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2554100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2554200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2554300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2554400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 2554500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2554600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2554700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2554800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2554900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2555000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2555100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2555200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2555300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 2555400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2555500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2555600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2555700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2555800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2555900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2556000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2556100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2556200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0295, - "step": 2556300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 2556400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2556500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2556600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2556700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0295, - "step": 2556800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2556900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 2557000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2557100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 2557200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 2557300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2557400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2557500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2557600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2557700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2557800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2557900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2558000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2558100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2558200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2558300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2558400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2558500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2558600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2558700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2558800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2558900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2559000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2559100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2559200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2559300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2559400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2559500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2559600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2559700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2559800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2559900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2560000 - }, - { - "epoch": 0.0, - "eval_loss": 0.028411865234375, - "eval_runtime": 3241.7894, - "eval_samples_per_second": 346.945, - "eval_steps_per_second": 21.684, - "step": 2560000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0292, - "step": 2560100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2560200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2560300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2560400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2560500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2560600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2560700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2560800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2560900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2561000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2561100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2561200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2561300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2561400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2561500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.029, - "step": 2561600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2561700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2561800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 2561900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2562000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2562100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2562200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2562300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2562400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2562500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2562600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 2562700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2562800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2562900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2563000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2563100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2563200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2563300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2563400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2563500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2563600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2563700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2563800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2563900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2564000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2564100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2564200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2564300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2564400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2564500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2564600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2564700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2564800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 2564900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2565000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 2565100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2565200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2565300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 2565400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2565500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2565600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2565700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2565800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0293, - "step": 2565900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2566000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2566100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2566200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2566300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2566400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2566500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2566600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2566700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 2566800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2566900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2567000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2567100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2567200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2567300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2567400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2567500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2567600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2567700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 2567800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2567900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0295, - "step": 2568000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2568100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2568200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2568300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2568400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2568500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2568600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2568700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2568800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2568900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2569000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2569100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2569200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2569300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0293, - "step": 2569400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0295, - "step": 2569500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2569600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2569700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2569800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2569900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2570000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2570100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2570200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2570300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2570400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0293, - "step": 2570500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2570600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 2570700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2570800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2570900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2571000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0291, - "step": 2571100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2571200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2571300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2571400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2571500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2571600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2571700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2571800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2571900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2572000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2572100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2572200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2572300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2572400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2572500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2572600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2572700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2572800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2572900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0296, - "step": 2573000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2573100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2573200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2573300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2573400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2573500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2573600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2573700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2573800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2573900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2574000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2574100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2574200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2574300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2574400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2574500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2574600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2574700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2574800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2574900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2575000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2575100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2575200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2575300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2575400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2575500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2575600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2575700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2575800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2575900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2576000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2576100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2576200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2576300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2576400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2576500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2576600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2576700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2576800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2576900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2577000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2577100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2577200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2577300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2577400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2577500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2577600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 2577700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2577800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2577900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2578000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2578100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2578200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2578300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2578400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2578500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2578600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2578700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2578800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2578900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2579000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2579100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2579200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2579300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2579400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2579500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2579600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2579700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2579800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0289, - "step": 2579900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2580000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0283966064453125, - "eval_runtime": 3042.109, - "eval_samples_per_second": 369.718, - "eval_steps_per_second": 23.108, - "step": 2580000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2580100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0295, - "step": 2580200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2580300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2580400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0295, - "step": 2580500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2580600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2580700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2580800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 2580900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2581000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2581100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2581200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2581300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2581400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 2581500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2581600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2581700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0295, - "step": 2581800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2581900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0291, - "step": 2582000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2582100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2582200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2582300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2582400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2582500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2582600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0294, - "step": 2582700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2582800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2582900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2583000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2583100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2583200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2583300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2583400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2583500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2583600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2583700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2583800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2583900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 2584000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2584100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2584200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2584300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2584400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2584500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 2584600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2584700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2584800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2584900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2585000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2585100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2585200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2585300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2585400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2585500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2585600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2585700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2585800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2585900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2586000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2586100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2586200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 2586300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 2586400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2586500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2586600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2586700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2586800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2586900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2587000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2587100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2587200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0295, - "step": 2587300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0295, - "step": 2587400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2587500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0296, - "step": 2587600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2587700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2587800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2587900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2588000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2588100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2588200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2588300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2588400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2588500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0284, - "step": 2588600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2588700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2588800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2588900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2589000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2589100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0295, - "step": 2589200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2589300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2589400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2589500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0296, - "step": 2589600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2589700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2589800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2589900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2590000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2590100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2590200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2590300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2590400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2590500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2590600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2590700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2590800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2590900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2591000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2591100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2591200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2591300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2591400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2591500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2591600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2591700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2591800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2591900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2592000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2592100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 2592200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0295, - "step": 2592300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2592400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2592500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2592600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2592700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2592800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2592900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2593000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2593100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2593200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2593300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2593400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2593500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2593600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2593700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2593800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2593900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2594000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2594100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2594200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2594300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2594400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2594500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2594600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2594700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2594800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2594900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2595000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2595100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2595200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2595300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2595400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2595500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2595600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2595700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2595800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2595900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2596000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2596100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2596200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2596300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2596400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2596500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2596600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2596700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2596800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2596900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2597000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2597100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2597200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2597300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2597400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2597500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0398, - "step": 2597600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0402, - "step": 2597700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2597800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2597900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2598000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2598100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2598200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2598300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2598400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2598500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 2598600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 2598700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2598800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2598900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2599000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2599100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2599200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2599300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2599400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2599500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2599600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2599700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2599800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2599900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2600000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0279998779296875, - "eval_runtime": 3070.9376, - "eval_samples_per_second": 366.247, - "eval_steps_per_second": 22.891, - "step": 2600000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2600100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2600200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2600300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2600400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2600500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2600600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2600700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 2600800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2600900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2601000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2601100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2601200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2601300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2601400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2601500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2601600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0403, - "step": 2601700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2601800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 2601900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0295, - "step": 2602000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 2602100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2602200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2602300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2602400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2602500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2602600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2602700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2602800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0295, - "step": 2602900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2603000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2603100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2603200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2603300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2603400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2603500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2603600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2603700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2603800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2603900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2604000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2604100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2604200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2604300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2604400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2604500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2604600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2604700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2604800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2604900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2605000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2605100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2605200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2605300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2605400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2605500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2605600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2605700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 2605800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2605900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2606000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2606100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2606200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2606300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2606400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2606500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2606600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2606700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2606800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2606900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2607000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2607100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2607200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2607300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2607400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2607500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2607600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2607700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2607800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 2607900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2608000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2608100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2608200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2608300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2608400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2608500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2608600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2608700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2608800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2608900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2609000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2609100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2609200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2609300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2609400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2609500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2609600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2609700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2609800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2609900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2610000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2610100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2610200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2610300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2610400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2610500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2610600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2610700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2610800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0295, - "step": 2610900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2611000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2611100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2611200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2611300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2611400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2611500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2611600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2611700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2611800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0472, - "step": 2611900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 2612000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 2612100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2612200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2612300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2612400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2612500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2612600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2612700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2612800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2612900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2613000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2613100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2613200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2613300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2613400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2613500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2613600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2613700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2613800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2613900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2614000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2614100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2614200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2614300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2614400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2614500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2614600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2614700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0296, - "step": 2614800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2614900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2615000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2615100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2615200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2615300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2615400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2615500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2615600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 2615700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2615800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 2615900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2616000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2616100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2616200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2616300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2616400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2616500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2616600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2616700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2616800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2616900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2617000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2617100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2617200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2617300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2617400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2617500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2617600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2617700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2617800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2617900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2618000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0366, - "step": 2618100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 2618200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 2618300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.038, - "step": 2618400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 2618500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2618600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2618700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2618800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2618900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2619000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2619100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2619200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2619300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2619400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2619500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2619600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2619700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2619800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2619900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2620000 - }, - { - "epoch": 0.0, - "eval_loss": 0.028076171875, - "eval_runtime": 3158.1346, - "eval_samples_per_second": 356.135, - "eval_steps_per_second": 22.259, - "step": 2620000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0293, - "step": 2620100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2620200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2620300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2620400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2620500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2620600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2620700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2620800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2620900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0295, - "step": 2621000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2621100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2621200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2621300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0296, - "step": 2621400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2621500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2621600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2621700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2621800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2621900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2622000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2622100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2622200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2622300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2622400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2622500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2622600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2622700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2622800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2622900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2623000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2623100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2623200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0402, - "step": 2623300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 2623400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2623500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2623600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2623700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2623800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2623900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2624000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2624100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2624200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2624300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0367, - "step": 2624400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 2624500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2624600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2624700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2624800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2624900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2625000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0292, - "step": 2625100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2625200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2625300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2625400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2625500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2625600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2625700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2625800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2625900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2626000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2626100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2626200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2626300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2626400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2626500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2626600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2626700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2626800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2626900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2627000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2627100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2627200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2627300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2627400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2627500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2627600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2627700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2627800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 2627900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2628000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0296, - "step": 2628100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2628200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2628300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2628400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0295, - "step": 2628500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2628600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 2628700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 2628800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2628900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2629000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2629100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2629200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2629300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2629400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2629500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2629600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2629700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2629800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2629900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2630000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2630100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2630200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2630300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2630400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2630500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2630600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2630700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2630800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2630900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0296, - "step": 2631000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2631100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2631200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2631300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.029, - "step": 2631400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2631500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2631600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2631700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2631800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2631900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2632000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 2632100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2632200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2632300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2632400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2632500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2632600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2632700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 2632800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2632900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2633000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2633100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0286, - "step": 2633200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2633300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2633400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2633500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2633600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2633700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2633800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0296, - "step": 2633900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2634000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2634100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2634200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2634300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2634400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2634500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2634600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 2634700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2634800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0291, - "step": 2634900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2635000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2635100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 2635200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2635300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2635400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2635500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2635600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2635700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2635800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2635900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2636000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2636100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2636200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 2636300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2636400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2636500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2636600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2636700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2636800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2636900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 2637000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2637100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0295, - "step": 2637200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2637300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2637400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2637500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0294, - "step": 2637600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2637700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2637800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2637900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2638000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2638100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2638200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2638300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2638400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2638500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2638600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 2638700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2638800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2638900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2639000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2639100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 2639200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2639300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2639400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2639500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2639600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2639700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2639800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2639900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2640000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0287933349609375, - "eval_runtime": 3157.0248, - "eval_samples_per_second": 356.26, - "eval_steps_per_second": 22.267, - "step": 2640000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2640100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2640200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 2640300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2640400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2640500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2640600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2640700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2640800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2640900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2641000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2641100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2641200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2641300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2641400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2641500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2641600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2641700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 2641800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2641900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2642000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0293, - "step": 2642100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2642200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2642300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2642400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2642500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2642600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2642700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2642800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2642900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2643000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2643100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2643200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 2643300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2643400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2643500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2643600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2643700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2643800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2643900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2644000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2644100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.071, - "step": 2644200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 2644300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2644400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2644500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2644600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2644700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2644800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2644900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2645000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 2645100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2645200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2645300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2645400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2645500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2645600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2645700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2645800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2645900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2646000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2646100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2646200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 2646300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2646400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2646500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2646600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2646700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2646800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2646900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2647000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0294, - "step": 2647100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2647200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 2647300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2647400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 2647500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0295, - "step": 2647600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2647700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2647800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2647900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2648000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2648100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 2648200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2648300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2648400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2648500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2648600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2648700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2648800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2648900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2649000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2649100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2649200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2649300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2649400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2649500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2649600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2649700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2649800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2649900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2650000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2650100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2650200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2650300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2650400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2650500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2650600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2650700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2650800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2650900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 2651000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2651100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 2651200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2651300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2651400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2651500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2651600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2651700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2651800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2651900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2652000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2652100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2652200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2652300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2652400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 2652500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2652600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2652700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2652800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2652900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2653000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2653100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2653200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2653300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2653400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2653500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2653600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2653700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2653800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0296, - "step": 2653900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2654000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2654100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2654200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2654300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2654400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0292, - "step": 2654500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2654600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2654700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2654800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2654900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0294, - "step": 2655000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2655100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2655200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2655300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 2655400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2655500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2655600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2655700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2655800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2655900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2656000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 2656100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2656200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2656300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2656400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2656500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2656600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2656700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2656800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2656900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2657000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2657100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2657200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2657300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2657400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2657500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2657600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2657700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2657800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2657900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2658000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2658100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2658200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 2658300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2658400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2658500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2658600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2658700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2658800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2658900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2659000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2659100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2659200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2659300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2659400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2659500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2659600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2659700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 2659800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2659900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0293, - "step": 2660000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0283203125, - "eval_runtime": 3160.9552, - "eval_samples_per_second": 355.817, - "eval_steps_per_second": 22.239, - "step": 2660000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2660100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0296, - "step": 2660200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2660300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2660400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2660500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2660600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2660700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2660800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2660900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2661000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2661100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2661200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 2661300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2661400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2661500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 2661600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2661700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2661800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2661900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2662000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2662100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2662200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2662300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2662400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 2662500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2662600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2662700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 2662800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0571, - "step": 2662900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2663000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2663100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2663200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2663300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 2663400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2663500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2663600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2663700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2663800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 2663900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2664000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2664100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2664200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2664300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2664400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2664500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2664600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2664700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2664800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2664900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0291, - "step": 2665000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2665100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2665200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2665300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2665400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2665500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 2665600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2665700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2665800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2665900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2666000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2666100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2666200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2666300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2666400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2666500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2666600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2666700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2666800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 2666900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 2667000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2667100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2667200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2667300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2667400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0294, - "step": 2667500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2667600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2667700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2667800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2667900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2668000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 2668100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 2668200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2668300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 2668400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 2668500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2668600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2668700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2668800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2668900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2669000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2669100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2669200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2669300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2669400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2669500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2669600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2669700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2669800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2669900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2670000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 2670100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 2670200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2670300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0296, - "step": 2670400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2670500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2670600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2670700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2670800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2670900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2671000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2671100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2671200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2671300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2671400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2671500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2671600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2671700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0295, - "step": 2671800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2671900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2672000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2672100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2672200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2672300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2672400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2672500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2672600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2672700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2672800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2672900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2673000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2673100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2673200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2673300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2673400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2673500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2673600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 2673700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2673800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2673900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2674000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2674100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2674200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2674300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2674400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2674500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2674600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2674700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2674800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2674900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2675000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2675100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0296, - "step": 2675200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2675300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2675400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2675500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2675600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2675700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2675800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2675900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2676000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2676100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2676200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2676300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 2676400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2676500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2676600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2676700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2676800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2676900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2677000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2677100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2677200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2677300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2677400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2677500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2677600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2677700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2677800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2677900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2678000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2678100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2678200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2678300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2678400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2678500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2678600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2678700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2678800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2678900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2679000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2679100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2679200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2679300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2679400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2679500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0296, - "step": 2679600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 2679700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 2679800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2679900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2680000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0281524658203125, - "eval_runtime": 3109.0725, - "eval_samples_per_second": 361.755, - "eval_steps_per_second": 22.61, - "step": 2680000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2680100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2680200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2680300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2680400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2680500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2680600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2680700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2680800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2680900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2681000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2681100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2681200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0295, - "step": 2681300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2681400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2681500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2681600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2681700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2681800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2681900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2682000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2682100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2682200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2682300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0293, - "step": 2682400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2682500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2682600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 2682700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0294, - "step": 2682800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2682900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2683000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2683100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2683200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2683300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2683400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0296, - "step": 2683500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2683600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2683700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2683800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2683900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2684000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2684100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2684200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2684300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2684400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2684500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2684600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2684700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2684800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2684900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2685000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2685100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0296, - "step": 2685200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2685300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2685400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2685500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0296, - "step": 2685600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2685700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2685800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2685900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2686000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2686100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2686200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2686300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2686400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 2686500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2686600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2686700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 2686800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 2686900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2687000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2687100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0376, - "step": 2687200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2687300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2687400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2687500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2687600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2687700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2687800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2687900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2688000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0292, - "step": 2688100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2688200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2688300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2688400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2688500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2688600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2688700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2688800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 2688900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2689000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 2689100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2689200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 2689300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2689400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2689500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2689600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2689700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2689800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2689900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2690000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2690100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2690200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2690300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2690400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2690500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2690600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2690700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2690800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2690900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2691000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2691100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2691200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2691300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2691400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2691500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2691600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2691700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2691800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2691900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2692000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2692100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2692200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2692300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2692400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2692500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2692600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2692700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2692800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2692900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2693000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2693100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0293, - "step": 2693200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2693300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2693400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2693500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2693600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2693700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2693800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2693900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2694000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2694100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2694200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2694300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2694400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2694500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2694600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2694700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2694800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2694900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2695000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2695100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2695200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2695300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2695400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2695500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 2695600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2695700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 2695800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2695900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2696000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 2696100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2696200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2696300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2696400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2696500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2696600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2696700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2696800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2696900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2697000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2697100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2697200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2697300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2697400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2697500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2697600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 2697700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2697800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2697900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2698000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2698100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2698200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2698300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2698400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0295, - "step": 2698500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2698600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 2698700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0291, - "step": 2698800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2698900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2699000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2699100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2699200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2699300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2699400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2699500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2699600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2699700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2699800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2699900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2700000 - }, - { - "epoch": 0.0, - "eval_loss": 0.028289794921875, - "eval_runtime": 3653.0406, - "eval_samples_per_second": 307.887, - "eval_steps_per_second": 19.243, - "step": 2700000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2700100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2700200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2700300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2700400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2700500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2700600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2700700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2700800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2700900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2701000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2701100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0291, - "step": 2701200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2701300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2701400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2701500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2701600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2701700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2701800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2701900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2702000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2702100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2702200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2702300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 2702400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2702500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2702600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2702700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2702800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2702900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2703000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2703100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2703200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2703300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2703400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 2703500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2703600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2703700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2703800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2703900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2704000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2704100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2704200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2704300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2704400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2704500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2704600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0285, - "step": 2704700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2704800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2704900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2705000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2705100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2705200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2705300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2705400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2705500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2705600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2705700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2705800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2705900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2706000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 2706100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2706200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2706300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2706400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2706500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2706600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2706700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2706800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2706900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2707000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2707100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 2707200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 2707300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2707400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2707500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2707600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2707700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2707800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 2707900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2708000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2708100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2708200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2708300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2708400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2708500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2708600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2708700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2708800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2708900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2709000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2709100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2709200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2709300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2709400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2709500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2709600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2709700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 2709800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2709900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2710000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2710100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0292, - "step": 2710200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0295, - "step": 2710300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2710400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2710500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2710600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2710700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2710800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2710900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2711000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2711100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2711200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2711300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2711400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2711500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2711600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0296, - "step": 2711700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 2711800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2711900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2712000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0293, - "step": 2712100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2712200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2712300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2712400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2712500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2712600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2712700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2712800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2712900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2713000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2713100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2713200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2713300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2713400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2713500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2713600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2713700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 2713800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2713900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2714000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2714100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2714200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2714300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2714400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2714500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2714600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2714700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2714800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2714900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2715000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2715100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2715200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2715300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2715400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2715500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2715600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2715700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2715800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2715900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2716000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2716100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2716200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2716300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2716400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2716500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2716600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2716700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 2716800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2716900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2717000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2717100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2717200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2717300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2717400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2717500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2717600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 2717700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2717800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2717900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2718000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2718100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2718200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2718300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2718400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2718500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2718600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2718700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2718800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2718900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2719000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2719100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2719200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2719300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 2719400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0295, - "step": 2719500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2719600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2719700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0382, - "step": 2719800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0416, - "step": 2719900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2720000 - }, - { - "epoch": 0.0, - "eval_loss": 0.028289794921875, - "eval_runtime": 3544.2998, - "eval_samples_per_second": 317.333, - "eval_steps_per_second": 19.834, - "step": 2720000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2720100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2720200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2720300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2720400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2720500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2720600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2720700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2720800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2720900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2721000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2721100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2721200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2721300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2721400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2721500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2721600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2721700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2721800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2721900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2722000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2722100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2722200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2722300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2722400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2722500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2722600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0292, - "step": 2722700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2722800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2722900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2723000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2723100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2723200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2723300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2723400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2723500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2723600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2723700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2723800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2723900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2724000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2724100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2724200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2724300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2724400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2724500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2724600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2724700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2724800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2724900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2725000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2725100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 2725200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2725300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2725400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2725500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0296, - "step": 2725600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2725700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0291, - "step": 2725800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2725900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2726000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2726100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2726200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2726300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2726400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2726500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2726600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2726700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2726800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2726900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2727000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2727100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2727200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2727300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2727400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2727500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0372, - "step": 2727600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2727700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2727800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2727900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2728000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2728100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2728200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0296, - "step": 2728300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2728400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2728500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 2728600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2728700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2728800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2728900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2729000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2729100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2729200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2729300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2729400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2729500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2729600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2729700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2729800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2729900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2730000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2730100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2730200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2730300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2730400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2730500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2730600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2730700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2730800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2730900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2731000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2731100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2731200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2731300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2731400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0294, - "step": 2731500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2731600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2731700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2731800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0461, - "step": 2731900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0532, - "step": 2732000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2732100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 2732200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2732300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2732400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2732500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2732600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2732700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2732800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2732900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2733000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2733100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 2733200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2733300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2733400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2733500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2733600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2733700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2733800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2733900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2734000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2734100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2734200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 2734300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2734400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2734500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2734600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2734700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2734800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2734900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2735000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2735100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2735200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2735300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2735400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0391, - "step": 2735500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2735600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2735700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2735800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2735900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2736000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0296, - "step": 2736100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2736200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2736300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2736400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2736500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2736600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 2736700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2736800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2736900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2737000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2737100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2737200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2737300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2737400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2737500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2737600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2737700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2737800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2737900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2738000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2738100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2738200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 2738300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2738400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2738500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2738600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2738700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2738800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2738900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2739000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2739100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2739200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2739300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2739400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2739500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2739600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2739700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2739800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 2739900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2740000 - }, - { - "epoch": 0.0, - "eval_loss": 0.028472900390625, - "eval_runtime": 3617.2028, - "eval_samples_per_second": 310.937, - "eval_steps_per_second": 19.434, - "step": 2740000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2740100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2740200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 2740300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2740400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2740500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2740600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 2740700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2740800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2740900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2741000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2741100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2741200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2741300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2741400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2741500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2741600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2741700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2741800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 2741900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2742000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2742100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2742200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2742300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2742400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2742500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2742600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2742700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2742800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2742900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2743000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2743100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2743200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2743300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2743400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2743500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2743600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2743700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2743800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2743900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2744000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2744100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 2744200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2744300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2744400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 2744500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2744600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2744700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2744800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2744900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2745000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2745100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2745200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2745300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2745400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2745500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0292, - "step": 2745600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2745700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2745800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2745900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2746000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 2746100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2746200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2746300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2746400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2746500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2746600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2746700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2746800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2746900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2747000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2747100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2747200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2747300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2747400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2747500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2747600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2747700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2747800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2747900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2748000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2748100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2748200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2748300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2748400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2748500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2748600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2748700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2748800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2748900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2749000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2749100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2749200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2749300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2749400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2749500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2749600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0295, - "step": 2749700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2749800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2749900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2750000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2750100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2750200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2750300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 2750400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0296, - "step": 2750500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0296, - "step": 2750600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2750700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2750800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2750900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2751000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2751100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2751200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2751300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2751400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2751500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2751600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2751700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2751800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2751900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2752000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2752100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2752200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2752300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2752400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2752500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2752600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2752700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2752800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2752900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 2753000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2753100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2753200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2753300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2753400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2753500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2753600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2753700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2753800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2753900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2754000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2754100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2754200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2754300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2754400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2754500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2754600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2754700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2754800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2754900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2755000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2755100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2755200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2755300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2755400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2755500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2755600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2755700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0295, - "step": 2755800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2755900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2756000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2756100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2756200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2756300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2756400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0296, - "step": 2756500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 2756600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0296, - "step": 2756700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2756800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 2756900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2757000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2757100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2757200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2757300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2757400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2757500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2757600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2757700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2757800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 2757900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2758000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2758100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2758200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2758300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2758400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2758500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2758600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2758700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2758800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2758900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2759000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 2759100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2759200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2759300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2759400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2759500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 2759600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2759700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2759800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2759900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2760000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0284576416015625, - "eval_runtime": 3352.5955, - "eval_samples_per_second": 335.478, - "eval_steps_per_second": 20.968, - "step": 2760000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2760100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2760200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0296, - "step": 2760300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2760400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2760500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2760600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2760700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2760800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2760900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2761000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2761100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2761200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2761300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 2761400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2761500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2761600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2761700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0296, - "step": 2761800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2761900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2762000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2762100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0295, - "step": 2762200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2762300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2762400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2762500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2762600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2762700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2762800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2762900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2763000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2763100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2763200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0294, - "step": 2763300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2763400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2763500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2763600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 2763700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2763800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2763900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2764000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2764100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2764200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.029, - "step": 2764300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2764400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2764500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0291, - "step": 2764600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2764700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0294, - "step": 2764800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2764900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2765000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2765100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2765200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2765300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2765400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 2765500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2765600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2765700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2765800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2765900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2766000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2766100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2766200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2766300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2766400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2766500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2766600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2766700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2766800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2766900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2767000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2767100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 2767200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2767300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0296, - "step": 2767400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.029, - "step": 2767500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2767600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2767700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2767800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2767900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2768000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2768100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2768200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2768300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2768400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2768500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2768600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2768700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2768800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2768900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2769000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2769100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2769200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 2769300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2769400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2769500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2769600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2769700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2769800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2769900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2770000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2770100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2770200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2770300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2770400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2770500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2770600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2770700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2770800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2770900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2771000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2771100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2771200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2771300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2771400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2771500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2771600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2771700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2771800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2771900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.029, - "step": 2772000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2772100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2772200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2772300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2772400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2772500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2772600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2772700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2772800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2772900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2773000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2773100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2773200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2773300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2773400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2773500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2773600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2773700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2773800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2773900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2774000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2774100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2774200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2774300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2774400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2774500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0296, - "step": 2774600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2774700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2774800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2774900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2775000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2775100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2775200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2775300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2775400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2775500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2775600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2775700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2775800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2775900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2776000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 2776100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2776200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2776300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 2776400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0296, - "step": 2776500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2776600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2776700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2776800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2776900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2777000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2777100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2777200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2777300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2777400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2777500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2777600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2777700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2777800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2777900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2778000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2778100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2778200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2778300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2778400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2778500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2778600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2778700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2778800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2778900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2779000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2779100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2779200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2779300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2779400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2779500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2779600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2779700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2779800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2779900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2780000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0281829833984375, - "eval_runtime": 3210.4835, - "eval_samples_per_second": 350.328, - "eval_steps_per_second": 21.896, - "step": 2780000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2780100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2780200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2780300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2780400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2780500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0295, - "step": 2780600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2780700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2780800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2780900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2781000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2781100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2781200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2781300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0294, - "step": 2781400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2781500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2781600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2781700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2781800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2781900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2782000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2782100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0291, - "step": 2782200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2782300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0295, - "step": 2782400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2782500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2782600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2782700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2782800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2782900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2783000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2783100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2783200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2783300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2783400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2783500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2783600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2783700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2783800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2783900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0295, - "step": 2784000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2784100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0296, - "step": 2784200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2784300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2784400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2784500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2784600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2784700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2784800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2784900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2785000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2785100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2785200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2785300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 2785400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2785500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2785600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0295, - "step": 2785700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2785800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2785900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2786000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2786100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2786200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.029, - "step": 2786300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2786400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2786500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2786600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2786700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2786800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2786900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0289, - "step": 2787000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2787100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 2787200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2787300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2787400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2787500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2787600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2787700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2787800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2787900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2788000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2788100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2788200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2788300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2788400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2788500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2788600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2788700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2788800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2788900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2789000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2789100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2789200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2789300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2789400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2789500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2789600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2789700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2789800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2789900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2790000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2790100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2790200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 2790300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2790400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2790500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2790600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2790700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 2790800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2790900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 2791000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 2791100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2791200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2791300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2791400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2791500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2791600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2791700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2791800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2791900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2792000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2792100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2792200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2792300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2792400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.029, - "step": 2792500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0292, - "step": 2792600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0292, - "step": 2792700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2792800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2792900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2793000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2793100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2793200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2793300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2793400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2793500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2793600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2793700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0292, - "step": 2793800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2793900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2794000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2794100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2794200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2794300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2794400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2794500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2794600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2794700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2794800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2794900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2795000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 2795100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2795200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2795300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2795400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2795500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2795600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2795700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2795800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2795900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2796000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2796100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2796200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2796300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2796400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2796500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2796600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2796700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2796800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2796900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2797000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0291, - "step": 2797100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2797200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2797300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2797400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2797500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2797600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2797700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2797800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2797900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 2798000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2798100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2798200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2798300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2798400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0293, - "step": 2798500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2798600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2798700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 2798800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2798900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2799000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2799100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2799200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2799300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0293, - "step": 2799400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2799500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2799600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2799700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2799800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2799900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0284, - "step": 2800000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0283355712890625, - "eval_runtime": 3172.1198, - "eval_samples_per_second": 354.565, - "eval_steps_per_second": 22.161, - "step": 2800000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2800100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2800200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2800300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2800400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2800500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2800600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2800700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2800800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2800900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2801000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2801100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2801200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2801300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2801400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2801500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2801600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2801700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2801800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2801900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2802000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2802100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0294, - "step": 2802200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2802300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2802400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2802500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2802600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2802700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2802800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2802900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2803000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2803100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2803200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2803300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2803400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2803500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2803600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2803700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 2803800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2803900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2804000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2804100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2804200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2804300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2804400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2804500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2804600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2804700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2804800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2804900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2805000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2805100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2805200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2805300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0295, - "step": 2805400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2805500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2805600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2805700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2805800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2805900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2806000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2806100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2806200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2806300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2806400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2806500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2806600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2806700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2806800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2806900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 2807000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2807100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2807200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2807300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2807400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2807500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2807600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2807700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2807800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2807900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2808000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0291, - "step": 2808100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2808200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2808300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2808400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2808500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 2808600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2808700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2808800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2808900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2809000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2809100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2809200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.029, - "step": 2809300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2809400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2809500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2809600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2809700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2809800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2809900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2810000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2810100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2810200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2810300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2810400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2810500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2810600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0295, - "step": 2810700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2810800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 2810900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2811000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 2811100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2811200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2811300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2811400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2811500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2811600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2811700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2811800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2811900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2812000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2812100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2812200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2812300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2812400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2812500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2812600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2812700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2812800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2812900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 2813000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 2813100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0389, - "step": 2813200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2813300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2813400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2813500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2813600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 2813700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2813800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2813900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2814000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2814100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2814200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2814300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2814400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2814500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2814600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2814700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2814800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2814900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2815000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2815100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2815200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2815300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2815400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2815500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2815600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2815700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2815800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2815900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 2816000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2816100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2816200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2816300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2816400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2816500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2816600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2816700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2816800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2816900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2817000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0293, - "step": 2817100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2817200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2817300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2817400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2817500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2817600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2817700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2817800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2817900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2818000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2818100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2818200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0351, - "step": 2818300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2818400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2818500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2818600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2818700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2818800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2818900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2819000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0295, - "step": 2819100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2819200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2819300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 2819400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2819500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2819600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2819700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2819800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2819900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2820000 - }, - { - "epoch": 0.0, - "eval_loss": 0.028533935546875, - "eval_runtime": 3062.156, - "eval_samples_per_second": 367.298, - "eval_steps_per_second": 22.956, - "step": 2820000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2820100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2820200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2820300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2820400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2820500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2820600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2820700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2820800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2820900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2821000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2821100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2821200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2821300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0287, - "step": 2821400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2821500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2821600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2821700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2821800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2821900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2822000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 2822100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2822200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2822300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2822400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2822500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2822600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2822700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2822800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2822900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 2823000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2823100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2823200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2823300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2823400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2823500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2823600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2823700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2823800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2823900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 2824000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2824100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2824200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2824300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.029, - "step": 2824400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2824500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2824600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2824700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2824800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2824900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2825000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2825100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2825200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2825300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2825400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2825500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2825600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2825700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2825800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2825900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2826000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2826100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2826200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2826300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2826400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2826500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 2826600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2826700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2826800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0295, - "step": 2826900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2827000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0294, - "step": 2827100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2827200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2827300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2827400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2827500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2827600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2827700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2827800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2827900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2828000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2828100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2828200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2828300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2828400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 2828500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2828600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2828700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2828800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2828900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2829000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2829100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2829200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2829300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2829400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2829500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2829600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2829700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2829800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2829900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2830000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2830100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2830200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2830300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2830400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2830500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2830600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2830700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0295, - "step": 2830800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2830900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2831000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2831100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2831200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2831300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2831400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2831500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2831600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2831700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2831800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2831900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2832000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2832100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2832200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2832300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2832400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2832500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2832600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2832700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2832800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2832900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2833000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2833100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2833200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2833300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2833400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2833500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2833600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2833700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2833800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2833900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2834000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2834100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2834200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2834300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2834400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2834500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2834600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0292, - "step": 2834700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2834800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2834900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2835000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 2835100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2835200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2835300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0295, - "step": 2835400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0294, - "step": 2835500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0294, - "step": 2835600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2835700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2835800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2835900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2836000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2836100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2836200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2836300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2836400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2836500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 2836600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2836700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2836800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2836900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2837000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2837100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2837200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2837300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2837400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2837500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2837600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2837700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2837800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2837900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2838000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2838100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2838200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 2838300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2838400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2838500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2838600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2838700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2838800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2838900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2839000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2839100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2839200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2839300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2839400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2839500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2839600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2839700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2839800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2839900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2840000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0284576416015625, - "eval_runtime": 3026.4651, - "eval_samples_per_second": 371.629, - "eval_steps_per_second": 23.227, - "step": 2840000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2840100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2840200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0296, - "step": 2840300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 2840400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 2840500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 2840600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2840700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2840800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2840900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2841000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2841100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2841200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2841300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2841400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2841500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2841600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 2841700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 2841800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2841900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2842000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2842100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2842200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2842300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2842400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2842500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2842600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2842700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2842800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2842900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2843000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2843100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2843200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2843300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0296, - "step": 2843400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2843500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2843600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2843700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2843800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2843900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2844000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2844100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2844200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2844300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2844400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2844500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2844600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2844700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2844800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2844900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2845000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2845100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2845200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2845300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2845400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2845500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2845600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2845700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2845800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 2845900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 2846000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2846100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0292, - "step": 2846200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2846300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 2846400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2846500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2846600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2846700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2846800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2846900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2847000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2847100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2847200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2847300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2847400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2847500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2847600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2847700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2847800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2847900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2848000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2848100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2848200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0586, - "step": 2848300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 2848400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2848500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2848600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2848700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2848800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 2848900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2849000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2849100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2849200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2849300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2849400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2849500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2849600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2849700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2849800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2849900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2850000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2850100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2850200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2850300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2850400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2850500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2850600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0289, - "step": 2850700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2850800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2850900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2851000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2851100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2851200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2851300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2851400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2851500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2851600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2851700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 2851800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2851900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2852000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 2852100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2852200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2852300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2852400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2852500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2852600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2852700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2852800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 2852900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2853000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2853100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0293, - "step": 2853200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2853300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2853400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2853500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2853600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2853700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 2853800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2853900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2854000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2854100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 2854200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2854300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2854400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2854500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2854600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2854700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2854800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2854900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2855000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 2855100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2855200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2855300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0289, - "step": 2855400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2855500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2855600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2855700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2855800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2855900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2856000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 2856100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2856200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2856300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 2856400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2856500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2856600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2856700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2856800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2856900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2857000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2857100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2857200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2857300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2857400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0295, - "step": 2857500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2857600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2857700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2857800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2857900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2858000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2858100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0294, - "step": 2858200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2858300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2858400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2858500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0296, - "step": 2858600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2858700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2858800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2858900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 2859000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2859100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2859200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0294, - "step": 2859300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2859400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2859500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2859600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2859700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2859800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2859900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2860000 - }, - { - "epoch": 0.0, - "eval_loss": 0.028289794921875, - "eval_runtime": 2980.4196, - "eval_samples_per_second": 377.371, - "eval_steps_per_second": 23.586, - "step": 2860000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2860100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2860200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2860300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2860400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2860500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2860600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2860700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2860800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 2860900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2861000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2861100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2861200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2861300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 2861400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2861500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2861600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2861700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2861800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2861900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2862000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2862100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2862200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2862300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0295, - "step": 2862400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2862500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2862600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0295, - "step": 2862700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2862800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0295, - "step": 2862900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2863000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2863100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2863200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2863300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2863400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2863500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2863600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2863700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2863800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2863900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2864000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2864100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2864200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2864300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2864400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2864500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2864600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2864700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2864800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 2864900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2865000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2865100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 2865200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2865300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 2865400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2865500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 2865600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2865700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2865800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2865900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2866000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2866100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2866200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0457, - "step": 2866300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2866400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2866500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2866600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2866700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2866800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2866900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2867000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2867100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2867200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2867300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2867400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2867500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2867600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 2867700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2867800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2867900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2868000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2868100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 2868200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2868300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2868400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2868500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2868600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 2868700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2868800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.037, - "step": 2868900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2869000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2869100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2869200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2869300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2869400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2869500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 2869600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2869700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 2869800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2869900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2870000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2870100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2870200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2870300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0296, - "step": 2870400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2870500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2870600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 2870700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2870800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0361, - "step": 2870900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2871000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2871100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2871200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2871300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2871400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2871500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2871600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2871700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2871800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2871900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2872000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2872100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 2872200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2872300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2872400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.029, - "step": 2872500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2872600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2872700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2872800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0294, - "step": 2872900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0292, - "step": 2873000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2873100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2873200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2873300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2873400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2873500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2873600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2873700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2873800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2873900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2874000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2874100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2874200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2874300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2874400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2874500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2874600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2874700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2874800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2874900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 2875000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2875100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2875200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2875300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2875400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2875500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2875600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2875700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2875800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2875900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2876000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 2876100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 2876200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2876300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2876400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2876500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2876600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2876700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2876800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2876900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 2877000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2877100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2877200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2877300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2877400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 2877500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2877600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2877700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2877800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2877900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2878000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2878100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2878200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2878300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2878400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2878500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2878600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2878700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2878800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2878900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2879000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2879100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2879200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2879300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2879400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2879500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0294, - "step": 2879600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2879700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0292, - "step": 2879800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2879900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2880000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0283660888671875, - "eval_runtime": 2966.4786, - "eval_samples_per_second": 379.144, - "eval_steps_per_second": 23.697, - "step": 2880000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2880100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2880200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2880300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2880400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 2880500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2880600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2880700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2880800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2880900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2881000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0292, - "step": 2881100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2881200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2881300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 2881400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2881500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2881600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2881700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2881800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2881900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2882000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2882100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2882200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2882300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2882400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2882500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2882600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2882700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2882800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2882900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2883000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2883100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2883200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2883300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2883400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2883500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2883600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2883700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2883800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2883900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2884000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 2884100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2884200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2884300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2884400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2884500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 2884600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2884700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2884800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2884900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2885000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2885100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2885200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2885300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2885400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2885500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2885600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2885700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2885800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2885900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2886000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2886100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2886200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0294, - "step": 2886300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2886400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2886500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2886600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2886700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2886800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2886900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2887000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2887100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2887200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2887300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2887400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2887500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2887600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2887700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2887800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2887900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2888000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2888100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2888200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2888300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2888400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2888500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0295, - "step": 2888600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2888700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2888800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2888900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2889000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2889100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2889200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2889300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2889400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2889500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2889600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2889700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2889800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2889900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0291, - "step": 2890000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2890100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2890200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2890300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2890400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2890500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2890600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2890700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2890800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2890900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2891000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2891100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 2891200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2891300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2891400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2891500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2891600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2891700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2891800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2891900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2892000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2892100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2892200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2892300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2892400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2892500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0371, - "step": 2892600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2892700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2892800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2892900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2893000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2893100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2893200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0296, - "step": 2893300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2893400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2893500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2893600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2893700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2893800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2893900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2894000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2894100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2894200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2894300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2894400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0294, - "step": 2894500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2894600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2894700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2894800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2894900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2895000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2895100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2895200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2895300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 2895400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2895500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2895600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2895700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0293, - "step": 2895800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2895900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2896000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2896100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2896200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2896300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2896400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2896500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2896600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2896700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2896800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2896900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2897000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2897100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2897200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0293, - "step": 2897300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2897400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2897500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2897600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2897700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2897800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2897900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0294, - "step": 2898000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2898100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2898200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2898300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2898400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2898500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2898600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2898700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2898800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2898900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2899000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2899100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2899200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2899300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2899400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2899500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2899600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2899700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2899800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2899900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2900000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0283355712890625, - "eval_runtime": 2977.4801, - "eval_samples_per_second": 377.743, - "eval_steps_per_second": 23.609, - "step": 2900000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2900100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2900200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2900300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2900400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2900500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2900600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2900700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2900800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2900900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 2901000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0294, - "step": 2901100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2901200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2901300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 2901400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2901500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 2901600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2901700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2901800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2901900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2902000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2902100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0292, - "step": 2902200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0295, - "step": 2902300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2902400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0291, - "step": 2902500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2902600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0291, - "step": 2902700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2902800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2902900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2903000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2903100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2903200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2903300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2903400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2903500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2903600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2903700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2903800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0292, - "step": 2903900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2904000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2904100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2904200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2904300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2904400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2904500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2904600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2904700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2904800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 2904900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2905000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2905100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2905200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0295, - "step": 2905300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2905400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2905500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0296, - "step": 2905600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2905700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0289, - "step": 2905800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2905900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2906000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2906100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2906200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2906300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2906400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2906500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2906600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2906700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2906800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2906900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2907000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2907100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2907200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2907300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2907400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2907500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2907600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2907700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2907800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2907900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2908000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2908100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2908200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2908300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.029, - "step": 2908400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2908500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2908600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2908700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2908800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2908900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2909000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2909100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2909200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2909300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2909400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2909500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2909600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2909700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2909800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 2909900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2910000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2910100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2910200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2910300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0294, - "step": 2910400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2910500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2910600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2910700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2910800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2910900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2911000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2911100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2911200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2911300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2911400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2911500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0296, - "step": 2911600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2911700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2911800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0294, - "step": 2911900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2912000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2912100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2912200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2912300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2912400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2912500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2912600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2912700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2912800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2912900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0383, - "step": 2913000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2913100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2913200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 2913300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 2913400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2913500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2913600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2913700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2913800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2913900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2914000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2914100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0502, - "step": 2914200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2914300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2914400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2914500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2914600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2914700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2914800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2914900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2915000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2915100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2915200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2915300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 2915400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2915500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2915600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2915700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2915800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2915900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2916000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2916100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 2916200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2916300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2916400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2916500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2916600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 2916700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2916800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2916900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2917000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2917100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2917200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2917300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2917400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2917500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2917600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2917700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2917800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2917900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2918000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2918100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2918200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2918300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2918400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2918500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2918600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0295, - "step": 2918700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2918800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2918900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2919000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2919100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2919200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2919300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2919400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2919500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2919600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2919700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2919800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2919900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2920000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0279388427734375, - "eval_runtime": 2996.4079, - "eval_samples_per_second": 375.357, - "eval_steps_per_second": 23.46, - "step": 2920000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2920100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2920200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0292, - "step": 2920300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2920400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2920500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0295, - "step": 2920600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2920700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2920800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2920900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2921000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 2921100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2921200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2921300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2921400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0295, - "step": 2921500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2921600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0295, - "step": 2921700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2921800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0291, - "step": 2921900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2922000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2922100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2922200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2922300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2922400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2922500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2922600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2922700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2922800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2922900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0292, - "step": 2923000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2923100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2923200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 2923300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2923400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2923500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2923600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2923700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2923800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2923900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2924000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2924100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2924200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2924300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 2924400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 2924500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2924600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2924700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2924800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2924900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2925000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 2925100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2925200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2925300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2925400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2925500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2925600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2925700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2925800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 2925900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2926000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2926100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2926200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2926300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2926400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2926500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2926600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2926700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2926800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2926900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2927000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2927100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2927200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2927300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2927400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2927500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2927600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2927700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2927800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2927900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2928000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2928100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2928200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2928300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2928400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2928500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2928600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2928700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2928800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2928900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2929000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2929100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2929200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2929300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2929400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2929500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2929600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2929700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2929800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2929900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2930000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2930100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2930200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2930300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2930400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2930500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2930600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2930700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2930800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2930900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2931000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2931100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2931200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2931300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2931400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2931500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2931600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2931700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2931800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2931900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2932000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2932100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2932200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2932300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2932400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2932500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2932600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2932700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2932800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2932900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2933000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2933100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2933200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2933300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2933400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2933500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2933600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2933700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2933800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 2933900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2934000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2934100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2934200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2934300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2934400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2934500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2934600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0292, - "step": 2934700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 2934800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2934900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2935000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2935100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2935200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2935300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2935400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2935500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2935600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2935700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2935800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2935900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2936000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2936100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2936200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 2936300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 2936400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2936500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2936600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0289, - "step": 2936700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2936800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 2936900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2937000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 2937100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 2937200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2937300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2937400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2937500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2937600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2937700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2937800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2937900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2938000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2938100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2938200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2938300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2938400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0294, - "step": 2938500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2938600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2938700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2938800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2938900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2939000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2939100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2939200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2939300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 2939400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2939500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2939600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0293, - "step": 2939700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2939800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2939900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2940000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0283050537109375, - "eval_runtime": 3008.1111, - "eval_samples_per_second": 373.897, - "eval_steps_per_second": 23.369, - "step": 2940000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0445, - "step": 2940100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2940200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2940300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2940400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2940500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.029, - "step": 2940600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2940700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0296, - "step": 2940800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2940900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0291, - "step": 2941000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2941100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2941200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2941300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2941400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 2941500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2941600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2941700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2941800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2941900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2942000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2942100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2942200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0294, - "step": 2942300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2942400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2942500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2942600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2942700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2942800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2942900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2943000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2943100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2943200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2943300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2943400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2943500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 2943600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2943700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.029, - "step": 2943800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2943900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2944000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2944100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2944200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2944300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2944400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2944500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2944600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2944700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2944800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2944900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 2945000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2945100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2945200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2945300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2945400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2945500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2945600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2945700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2945800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 2945900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2946000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2946100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 2946200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2946300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2946400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2946500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2946600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2946700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2946800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2946900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2947000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0295, - "step": 2947100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2947200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0296, - "step": 2947300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2947400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2947500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2947600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2947700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2947800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2947900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2948000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2948100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2948200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2948300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2948400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2948500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2948600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2948700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2948800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 2948900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2949000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 2949100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0294, - "step": 2949200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2949300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2949400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2949500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2949600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2949700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2949800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2949900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2950000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2950100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2950200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2950300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0295, - "step": 2950400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2950500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2950600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0291, - "step": 2950700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2950800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2950900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2951000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2951100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2951200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2951300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 2951400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2951500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2951600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0295, - "step": 2951700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2951800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2951900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2952000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2952100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2952200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2952300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2952400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2952500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2952600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2952700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2952800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2952900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2953000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0296, - "step": 2953100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2953200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2953300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2953400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2953500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2953600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2953700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2953800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2953900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2954000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2954100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2954200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2954300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2954400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2954500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0291, - "step": 2954600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2954700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2954800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2954900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2955000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2955100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2955200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2955300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2955400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2955500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2955600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2955700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2955800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2955900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2956000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2956100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2956200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2956300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2956400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 2956500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2956600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2956700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2956800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0294, - "step": 2956900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2957000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2957100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2957200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2957300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2957400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2957500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0288, - "step": 2957600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2957700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2957800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2957900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2958000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2958100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2958200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2958300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2958400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2958500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2958600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2958700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2958800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2958900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2959000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0294, - "step": 2959100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2959200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2959300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2959400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2959500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2959600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2959700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2959800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 2959900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2960000 - }, - { - "epoch": 0.0, - "eval_loss": 0.02825927734375, - "eval_runtime": 3374.7233, - "eval_samples_per_second": 333.279, - "eval_steps_per_second": 20.83, - "step": 2960000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 2960100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2960200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2960300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2960400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2960500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2960600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2960700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2960800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2960900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2961000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2961100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2961200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2961300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2961400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2961500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2961600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 2961700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0296, - "step": 2961800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2961900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2962000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2962100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2962200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0292, - "step": 2962300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2962400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2962500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2962600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2962700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 2962800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2962900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2963000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2963100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2963200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2963300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2963400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2963500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2963600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2963700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2963800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2963900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2964000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2964100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2964200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2964300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 2964400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 2964500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2964600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2964700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2964800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2964900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2965000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2965100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2965200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2965300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2965400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2965500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2965600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0292, - "step": 2965700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2965800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2965900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2966000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2966100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2966200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2966300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2966400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2966500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2966600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2966700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2966800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2966900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2967000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2967100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2967200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2967300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2967400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0292, - "step": 2967500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2967600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2967700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2967800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2967900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2968000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2968100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2968200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2968300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2968400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2968500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2968600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2968700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2968800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2968900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2969000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2969100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2969200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2969300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2969400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2969500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2969600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2969700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2969800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2969900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2970000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2970100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2970200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2970300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2970400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 2970500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2970600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0296, - "step": 2970700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2970800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2970900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2971000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2971100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0344, - "step": 2971200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2971300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2971400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2971500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 2971600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2971700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0503, - "step": 2971800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0439, - "step": 2971900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 2972000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 2972100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2972200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0404, - "step": 2972300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.039, - "step": 2972400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0411, - "step": 2972500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 2972600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2972700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2972800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 2972900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2973000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2973100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2973200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2973300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2973400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2973500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 2973600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2973700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 2973800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2973900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2974000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 2974100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2974200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2974300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 2974400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 2974500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2974600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 2974700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2974800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 2974900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2975000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2975100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2975200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2975300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2975400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2975500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2975600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2975700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2975800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2975900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 2976000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 2976100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2976200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2976300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2976400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2976500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2976600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2976700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2976800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2976900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2977000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2977100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 2977200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2977300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2977400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2977500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2977600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2977700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2977800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2977900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2978000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2978100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2978200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2978300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2978400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2978500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2978600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2978700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2978800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2978900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2979000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0296, - "step": 2979100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2979200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2979300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2979400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2979500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2979600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2979700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2979800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2979900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2980000 - }, - { - "epoch": 0.0, - "eval_loss": 0.02874755859375, - "eval_runtime": 3357.7861, - "eval_samples_per_second": 334.96, - "eval_steps_per_second": 20.935, - "step": 2980000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2980100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2980200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 2980300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2980400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2980500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2980600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0291, - "step": 2980700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0296, - "step": 2980800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2980900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2981000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2981100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2981200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2981300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2981400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2981500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2981600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2981700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2981800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2981900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2982000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2982100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0287, - "step": 2982200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2982300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2982400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0288, - "step": 2982500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0296, - "step": 2982600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2982700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2982800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2982900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0291, - "step": 2983000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2983100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2983200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2983300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2983400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0295, - "step": 2983500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2983600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2983700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2983800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2983900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2984000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 2984100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 2984200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2984300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2984400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2984500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2984600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2984700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2984800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2984900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2985000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 2985100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2985200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2985300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2985400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2985500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 2985600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2985700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2985800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2985900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2986000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2986100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2986200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2986300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2986400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0291, - "step": 2986500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2986600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2986700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2986800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2986900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2987000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2987100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 2987200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2987300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2987400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0295, - "step": 2987500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 2987600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2987700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2987800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2987900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2988000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2988100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2988200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2988300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2988400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2988500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2988600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2988700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2988800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2988900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2989000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2989100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 2989200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2989300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2989400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 2989500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2989600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2989700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2989800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2989900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2990000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2990100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2990200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2990300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 2990400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2990500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2990600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2990700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2990800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2990900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2991000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0292, - "step": 2991100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2991200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2991300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2991400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2991500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0295, - "step": 2991600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2991700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0294, - "step": 2991800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2991900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2992000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 2992100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2992200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2992300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2992400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2992500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2992600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2992700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0287, - "step": 2992800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2992900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2993000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2993100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2993200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 2993300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2993400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 2993500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2993600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2993700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0372, - "step": 2993800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2993900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 2994000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2994100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 2994200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2994300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2994400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2994500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2994600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 2994700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 2994800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2994900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 2995000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2995100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2995200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 2995300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2995400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2995500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2995600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2995700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2995800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2995900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2996000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2996100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2996200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2996300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0293, - "step": 2996400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 2996500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 2996600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2996700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 2996800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 2996900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 2997000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 2997100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2997200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2997300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2997400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2997500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2997600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 2997700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2997800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2997900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 2998000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2998100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2998200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2998300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 2998400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2998500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 2998600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 2998700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 2998800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 2998900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 2999000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 2999100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 2999200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 2999300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 2999400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 2999500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0295, - "step": 2999600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 2999700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 2999800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 2999900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 3000000 - }, - { - "epoch": 0.0, - "eval_loss": 0.027923583984375, - "eval_runtime": 3313.1307, - "eval_samples_per_second": 339.474, - "eval_steps_per_second": 21.217, - "step": 3000000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 3000100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 3000200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 3000300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 3000400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 3000500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 3000600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 3000700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 3000800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 3000900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 3001000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 3001100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 3001200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 3001300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 3001400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 3001500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 3001600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 3001700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 3001800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 3001900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 3002000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 3002100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0384, - "step": 3002200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0409, - "step": 3002300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 3002400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 3002500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 3002600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 3002700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 3002800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 3002900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 3003000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 3003100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0283, - "step": 3003200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 3003300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 3003400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 3003500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 3003600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0293, - "step": 3003700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 3003800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 3003900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 3004000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 3004100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 3004200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 3004300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 3004400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 3004500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 3004600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 3004700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 3004800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 3004900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 3005000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 3005100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 3005200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 3005300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 3005400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 3005500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 3005600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0296, - "step": 3005700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 3005800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 3005900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 3006000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 3006100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 3006200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 3006300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 3006400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 3006500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 3006600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 3006700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 3006800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 3006900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 3007000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 3007100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 3007200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 3007300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 3007400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 3007500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 3007600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 3007700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 3007800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 3007900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 3008000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 3008100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 3008200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0292, - "step": 3008300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 3008400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 3008500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 3008600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 3008700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 3008800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 3008900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 3009000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 3009100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 3009200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 3009300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 3009400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 3009500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 3009600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 3009700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 3009800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0293, - "step": 3009900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 3010000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 3010100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 3010200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 3010300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 3010400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 3010500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 3010600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 3010700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 3010800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0289, - "step": 3010900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 3011000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 3011100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 3011200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 3011300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0296, - "step": 3011400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 3011500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 3011600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 3011700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 3011800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 3011900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 3012000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 3012100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 3012200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 3012300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 3012400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 3012500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 3012600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 3012700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 3012800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 3012900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 3013000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 3013100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0294, - "step": 3013200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 3013300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 3013400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 3013500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 3013600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 3013700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 3013800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 3013900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 3014000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 3014100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 3014200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 3014300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 3014400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 3014500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 3014600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0295, - "step": 3014700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 3014800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 3014900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 3015000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 3015100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 3015200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 3015300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 3015400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 3015500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 3015600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 3015700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 3015800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 3015900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 3016000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 3016100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 3016200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 3016300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 3016400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 3016500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 3016600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 3016700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.029, - "step": 3016800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 3016900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 3017000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 3017100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 3017200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 3017300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 3017400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 3017500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 3017600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 3017700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 3017800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 3017900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 3018000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0286, - "step": 3018100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 3018200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 3018300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 3018400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 3018500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 3018600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 3018700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 3018800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 3018900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 3019000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 3019100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 3019200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 3019300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 3019400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 3019500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 3019600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 3019700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 3019800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 3019900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 3020000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0284576416015625, - "eval_runtime": 3375.5001, - "eval_samples_per_second": 333.202, - "eval_steps_per_second": 20.825, - "step": 3020000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 3020100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 3020200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 3020300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 3020400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 3020500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 3020600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 3020700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0288, - "step": 3020800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 3020900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 3021000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 3021100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 3021200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 3021300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 3021400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 3021500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 3021600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 3021700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 3021800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 3021900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 3022000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0295, - "step": 3022100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 3022200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 3022300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 3022400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 3022500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 3022600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 3022700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0296, - "step": 3022800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 3022900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 3023000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 3023100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 3023200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 3023300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0291, - "step": 3023400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 3023500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 3023600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 3023700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 3023800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 3023900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 3024000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 3024100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 3024200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 3024300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 3024400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 3024500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 3024600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 3024700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 3024800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 3024900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 3025000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 3025100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 3025200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 3025300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0291, - "step": 3025400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0293, - "step": 3025500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 3025600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 3025700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 3025800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 3025900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 3026000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 3026100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 3026200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 3026300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 3026400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 3026500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 3026600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 3026700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 3026800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 3026900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0382, - "step": 3027000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 3027100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 3027200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 3027300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 3027400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 3027500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 3027600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 3027700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 3027800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 3027900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 3028000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 3028100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 3028200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 3028300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 3028400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 3028500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 3028600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 3028700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 3028800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 3028900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 3029000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 3029100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 3029200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 3029300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 3029400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 3029500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 3029600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0366, - "step": 3029700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0359, - "step": 3029800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 3029900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 3030000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0357, - "step": 3030100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 3030200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 3030300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 3030400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 3030500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 3030600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 3030700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 3030800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 3030900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 3031000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 3031100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 3031200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 3031300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 3031400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 3031500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 3031600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0295, - "step": 3031700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 3031800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 3031900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 3032000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 3032100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 3032200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 3032300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 3032400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 3032500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 3032600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 3032700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 3032800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 3032900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 3033000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 3033100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 3033200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 3033300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 3033400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 3033500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 3033600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 3033700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 3033800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 3033900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 3034000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 3034100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 3034200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0293, - "step": 3034300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 3034400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 3034500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 3034600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 3034700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 3034800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 3034900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 3035000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 3035100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 3035200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 3035300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 3035400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 3035500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 3035600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 3035700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 3035800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 3035900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 3036000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 3036100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 3036200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 3036300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 3036400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 3036500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 3036600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 3036700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 3036800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 3036900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 3037000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 3037100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 3037200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 3037300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 3037400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 3037500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 3037600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 3037700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 3037800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 3037900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 3038000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 3038100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 3038200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 3038300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 3038400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 3038500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 3038600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 3038700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 3038800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 3038900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 3039000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 3039100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 3039200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 3039300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 3039400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 3039500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 3039600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 3039700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 3039800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 3039900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 3040000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0287017822265625, - "eval_runtime": 3318.2668, - "eval_samples_per_second": 338.949, - "eval_steps_per_second": 21.185, - "step": 3040000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 3040100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 3040200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 3040300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 3040400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 3040500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 3040600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 3040700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 3040800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 3040900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 3041000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 3041100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 3041200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 3041300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 3041400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 3041500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 3041600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 3041700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 3041800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 3041900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 3042000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 3042100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 3042200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 3042300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 3042400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 3042500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 3042600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0292, - "step": 3042700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 3042800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 3042900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 3043000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 3043100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 3043200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 3043300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 3043400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 3043500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 3043600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 3043700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 3043800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 3043900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 3044000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 3044100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 3044200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 3044300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 3044400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 3044500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 3044600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 3044700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 3044800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 3044900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 3045000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 3045100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 3045200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 3045300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 3045400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0296, - "step": 3045500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 3045600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0293, - "step": 3045700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 3045800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 3045900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 3046000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 3046100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 3046200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 3046300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 3046400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 3046500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 3046600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 3046700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 3046800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 3046900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 3047000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 3047100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 3047200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 3047300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 3047400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 3047500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 3047600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 3047700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 3047800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0296, - "step": 3047900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 3048000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 3048100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 3048200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 3048300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 3048400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 3048500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 3048600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 3048700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 3048800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 3048900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 3049000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 3049100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 3049200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 3049300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 3049400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 3049500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 3049600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 3049700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 3049800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0411, - "step": 3049900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0364, - "step": 3050000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 3050100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 3050200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0295, - "step": 3050300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 3050400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 3050500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 3050600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 3050700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 3050800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 3050900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 3051000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 3051100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 3051200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 3051300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 3051400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 3051500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 3051600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0296, - "step": 3051700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 3051800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 3051900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0291, - "step": 3052000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 3052100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 3052200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 3052300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 3052400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 3052500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 3052600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 3052700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 3052800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 3052900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 3053000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 3053100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 3053200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 3053300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 3053400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 3053500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 3053600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 3053700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 3053800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 3053900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 3054000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 3054100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 3054200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 3054300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 3054400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 3054500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 3054600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 3054700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 3054800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 3054900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 3055000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 3055100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 3055200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 3055300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 3055400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 3055500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 3055600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 3055700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 3055800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 3055900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 3056000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 3056100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 3056200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 3056300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 3056400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 3056500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 3056600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 3056700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 3056800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 3056900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 3057000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 3057100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 3057200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 3057300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 3057400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 3057500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 3057600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 3057700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 3057800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 3057900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 3058000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 3058100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 3058200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 3058300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 3058400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 3058500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 3058600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 3058700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 3058800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 3058900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 3059000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 3059100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 3059200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 3059300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 3059400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 3059500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 3059600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 3059700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 3059800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 3059900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 3060000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0281982421875, - "eval_runtime": 3454.829, - "eval_samples_per_second": 325.551, - "eval_steps_per_second": 20.347, - "step": 3060000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 3060100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0414, - "step": 3060200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 3060300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 3060400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 3060500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0296, - "step": 3060600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 3060700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 3060800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 3060900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 3061000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 3061100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 3061200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 3061300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 3061400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 3061500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 3061600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0284, - "step": 3061700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 3061800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 3061900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 3062000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 3062100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 3062200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 3062300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 3062400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 3062500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 3062600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 3062700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 3062800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 3062900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 3063000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 3063100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 3063200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 3063300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 3063400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 3063500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 3063600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 3063700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 3063800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 3063900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 3064000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 3064100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 3064200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 3064300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 3064400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 3064500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 3064600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 3064700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0285, - "step": 3064800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 3064900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 3065000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 3065100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 3065200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 3065300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 3065400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 3065500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 3065600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 3065700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 3065800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 3065900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 3066000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 3066100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 3066200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 3066300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 3066400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 3066500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 3066600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 3066700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0293, - "step": 3066800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 3066900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 3067000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 3067100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 3067200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 3067300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 3067400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 3067500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1114, - "step": 3067600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 3067700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 3067800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 3067900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 3068000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 3068100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 3068200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 3068300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 3068400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 3068500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 3068600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 3068700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 3068800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 3068900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 3069000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 3069100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 3069200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 3069300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 3069400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0296, - "step": 3069500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 3069600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 3069700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0288, - "step": 3069800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 3069900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 3070000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 3070100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 3070200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 3070300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 3070400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 3070500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 3070600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 3070700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0291, - "step": 3070800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 3070900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 3071000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 3071100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 3071200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 3071300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 3071400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 3071500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 3071600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 3071700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 3071800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 3071900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 3072000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 3072100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 3072200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 3072300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 3072400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 3072500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 3072600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 3072700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 3072800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 3072900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 3073000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 3073100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 3073200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 3073300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 3073400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 3073500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 3073600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 3073700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0341, - "step": 3073800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 3073900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0349, - "step": 3074000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 3074100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 3074200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 3074300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 3074400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 3074500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 3074600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 3074700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 3074800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0294, - "step": 3074900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 3075000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 3075100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0293, - "step": 3075200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0296, - "step": 3075300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 3075400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 3075500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 3075600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 3075700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 3075800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 3075900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 3076000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 3076100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 3076200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 3076300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 3076400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 3076500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 3076600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 3076700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 3076800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 3076900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 3077000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0376, - "step": 3077100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 3077200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 3077300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0459, - "step": 3077400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 3077500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 3077600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 3077700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 3077800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0423, - "step": 3077900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0366, - "step": 3078000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 3078100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 3078200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 3078300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 3078400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0294, - "step": 3078500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 3078600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 3078700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 3078800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 3078900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 3079000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 3079100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0292, - "step": 3079200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 3079300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 3079400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 3079500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 3079600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0295, - "step": 3079700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 3079800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 3079900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0375, - "step": 3080000 - }, - { - "epoch": 0.0, - "eval_loss": 0.03399658203125, - "eval_runtime": 3368.1465, - "eval_samples_per_second": 333.929, - "eval_steps_per_second": 20.871, - "step": 3080000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 3080100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 3080200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 3080300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 3080400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 3080500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 3080600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 3080700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 3080800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 3080900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 3081000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 3081100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 3081200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 3081300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 3081400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 3081500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 3081600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 3081700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 3081800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 3081900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 3082000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0282, - "step": 3082100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 3082200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 3082300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 3082400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 3082500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 3082600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 3082700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 3082800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 3082900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 3083000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 3083100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 3083200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 3083300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 3083400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 3083500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 3083600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 3083700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 3083800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 3083900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 3084000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 3084100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 3084200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 3084300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 3084400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 3084500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 3084600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 3084700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 3084800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 3084900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 3085000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 3085100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 3085200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 3085300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 3085400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 3085500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 3085600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 3085700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 3085800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 3085900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 3086000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 3086100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 3086200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 3086300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 3086400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 3086500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 3086600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 3086700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 3086800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.029, - "step": 3086900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 3087000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 3087100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 3087200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 3087300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 3087400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 3087500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 3087600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 3087700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 3087800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 3087900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0293, - "step": 3088000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 3088100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 3088200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 3088300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 3088400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 3088500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 3088600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 3088700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 3088800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 3088900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 3089000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 3089100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0343, - "step": 3089200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 3089300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 3089400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 3089500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 3089600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 3089700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 3089800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 3089900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 3090000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 3090100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 3090200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 3090300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 3090400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 3090500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 3090600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 3090700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 3090800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 3090900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 3091000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 3091100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 3091200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 3091300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 3091400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 3091500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 3091600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 3091700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 3091800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 3091900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 3092000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 3092100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 3092200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 3092300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 3092400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 3092500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 3092600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 3092700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 3092800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 3092900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 3093000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 3093100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0295, - "step": 3093200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 3093300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 3093400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 3093500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 3093600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 3093700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 3093800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 3093900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0292, - "step": 3094000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 3094100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 3094200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 3094300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 3094400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0284, - "step": 3094500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0294, - "step": 3094600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0283, - "step": 3094700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 3094800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 3094900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 3095000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.029, - "step": 3095100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 3095200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 3095300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 3095400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0293, - "step": 3095500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 3095600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 3095700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 3095800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 3095900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 3096000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0296, - "step": 3096100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 3096200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0295, - "step": 3096300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 3096400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 3096500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 3096600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 3096700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 3096800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 3096900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 3097000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 3097100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 3097200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 3097300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0293, - "step": 3097400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 3097500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 3097600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 3097700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 3097800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 3097900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 3098000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 3098100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 3098200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 3098300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 3098400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 3098500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 3098600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 3098700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 3098800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 3098900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.034, - "step": 3099000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 3099100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 3099200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 3099300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 3099400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 3099500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 3099600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 3099700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 3099800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 3099900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 3100000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0283203125, - "eval_runtime": 3298.1173, - "eval_samples_per_second": 341.02, - "eval_steps_per_second": 21.314, - "step": 3100000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 3100100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 3100200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 3100300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 3100400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 3100500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 3100600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 3100700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 3100800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 3100900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 3101000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 3101100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 3101200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0291, - "step": 3101300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 3101400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 3101500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 3101600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 3101700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 3101800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 3101900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 3102000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 3102100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 3102200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 3102300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 3102400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0284, - "step": 3102500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 3102600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 3102700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 3102800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 3102900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 3103000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 3103100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 3103200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 3103300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0288, - "step": 3103400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 3103500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 3103600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.1013, - "step": 3103700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0402, - "step": 3103800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 3103900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 3104000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 3104100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 3104200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 3104300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 3104400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0295, - "step": 3104500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 3104600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 3104700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0335, - "step": 3104800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 3104900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 3105000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 3105100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 3105200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 3105300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 3105400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 3105500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 3105600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 3105700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 3105800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 3105900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 3106000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 3106100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 3106200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.029, - "step": 3106300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 3106400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 3106500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 3106600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 3106700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 3106800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 3106900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 3107000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 3107100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 3107200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 3107300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 3107400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 3107500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 3107600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 3107700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 3107800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 3107900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 3108000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 3108100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 3108200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 3108300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 3108400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 3108500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 3108600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 3108700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 3108800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 3108900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 3109000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 3109100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0294, - "step": 3109200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 3109300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0296, - "step": 3109400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 3109500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 3109600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 3109700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 3109800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0476, - "step": 3109900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.04, - "step": 3110000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.035, - "step": 3110100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 3110200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 3110300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 3110400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 3110500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 3110600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 3110700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 3110800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 3110900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 3111000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 3111100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0295, - "step": 3111200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 3111300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 3111400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 3111500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0353, - "step": 3111600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 3111700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 3111800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 3111900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 3112000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 3112100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 3112200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 3112300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 3112400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 3112500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 3112600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 3112700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 3112800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 3112900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 3113000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 3113100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 3113200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 3113300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 3113400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 3113500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 3113600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 3113700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 3113800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 3113900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0292, - "step": 3114000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 3114100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 3114200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 3114300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0347, - "step": 3114400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 3114500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 3114600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0293, - "step": 3114700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 3114800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 3114900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 3115000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 3115100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0345, - "step": 3115200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 3115300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 3115400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 3115500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 3115600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 3115700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0342, - "step": 3115800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 3115900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 3116000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 3116100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 3116200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 3116300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 3116400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 3116500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 3116600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 3116700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 3116800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 3116900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 3117000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 3117100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 3117200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 3117300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 3117400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 3117500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 3117600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0296, - "step": 3117700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 3117800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 3117900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 3118000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 3118100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 3118200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 3118300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 3118400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 3118500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0294, - "step": 3118600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 3118700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 3118800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 3118900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 3119000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0387, - "step": 3119100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 3119200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 3119300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0379, - "step": 3119400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 3119500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0348, - "step": 3119600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 3119700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 3119800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 3119900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 3120000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0281982421875, - "eval_runtime": 3281.7064, - "eval_samples_per_second": 342.725, - "eval_steps_per_second": 21.421, - "step": 3120000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 3120100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 3120200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 3120300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 3120400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 3120500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 3120600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 3120700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 3120800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 3120900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 3121000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 3121100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 3121200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 3121300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 3121400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 3121500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 3121600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 3121700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 3121800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 3121900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 3122000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 3122100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 3122200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 3122300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 3122400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 3122500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0293, - "step": 3122600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0294, - "step": 3122700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 3122800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0295, - "step": 3122900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 3123000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 3123100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 3123200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 3123300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 3123400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 3123500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 3123600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 3123700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 3123800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 3123900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 3124000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 3124100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 3124200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 3124300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 3124400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 3124500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 3124600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 3124700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0284, - "step": 3124800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 3124900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 3125000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 3125100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 3125200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 3125300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 3125400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 3125500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0296, - "step": 3125600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 3125700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 3125800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 3125900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 3126000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 3126100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 3126200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 3126300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 3126400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 3126500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 3126600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 3126700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 3126800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 3126900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 3127000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 3127100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 3127200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 3127300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0294, - "step": 3127400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 3127500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 3127600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 3127700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 3127800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 3127900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 3128000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 3128100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 3128200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 3128300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 3128400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 3128500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 3128600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 3128700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 3128800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 3128900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 3129000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 3129100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 3129200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 3129300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 3129400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 3129500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 3129600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 3129700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0294, - "step": 3129800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 3129900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0287, - "step": 3130000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 3130100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 3130200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 3130300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 3130400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 3130500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 3130600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 3130700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 3130800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 3130900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 3131000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 3131100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 3131200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 3131300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 3131400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 3131500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 3131600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 3131700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 3131800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 3131900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 3132000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 3132100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 3132200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 3132300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 3132400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 3132500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 3132600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 3132700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 3132800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 3132900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 3133000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 3133100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 3133200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 3133300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 3133400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 3133500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.036, - "step": 3133600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 3133700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 3133800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 3133900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 3134000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 3134100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 3134200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 3134300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 3134400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 3134500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 3134600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 3134700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 3134800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 3134900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 3135000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 3135100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 3135200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 3135300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 3135400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 3135500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 3135600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 3135700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 3135800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 3135900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 3136000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 3136100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 3136200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 3136300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 3136400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 3136500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 3136600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 3136700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0296, - "step": 3136800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 3136900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 3137000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 3137100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 3137200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 3137300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 3137400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 3137500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 3137600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 3137700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 3137800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 3137900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0293, - "step": 3138000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 3138100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 3138200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 3138300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 3138400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 3138500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 3138600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 3138700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 3138800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.029, - "step": 3138900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 3139000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 3139100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 3139200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 3139300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 3139400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 3139500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 3139600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 3139700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 3139800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 3139900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0328, - "step": 3140000 - }, - { - "epoch": 0.0, - "eval_loss": 0.028167724609375, - "eval_runtime": 3373.5486, - "eval_samples_per_second": 333.395, - "eval_steps_per_second": 20.837, - "step": 3140000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 3140100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 3140200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 3140300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 3140400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 3140500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0294, - "step": 3140600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 3140700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 3140800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 3140900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 3141000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 3141100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 3141200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 3141300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 3141400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 3141500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 3141600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 3141700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 3141800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 3141900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 3142000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 3142100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 3142200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 3142300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 3142400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 3142500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 3142600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 3142700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0338, - "step": 3142800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0355, - "step": 3142900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 3143000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 3143100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 3143200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 3143300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 3143400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 3143500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 3143600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 3143700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0295, - "step": 3143800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0296, - "step": 3143900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 3144000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0296, - "step": 3144100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 3144200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 3144300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 3144400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 3144500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 3144600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 3144700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 3144800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 3144900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 3145000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 3145100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0295, - "step": 3145200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 3145300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 3145400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 3145500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0289, - "step": 3145600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 3145700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 3145800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 3145900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 3146000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 3146100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 3146200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 3146300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 3146400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 3146500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 3146600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0296, - "step": 3146700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 3146800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 3146900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 3147000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 3147100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 3147200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 3147300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 3147400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0296, - "step": 3147500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 3147600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 3147700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 3147800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 3147900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 3148000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 3148100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 3148200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 3148300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 3148400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 3148500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 3148600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 3148700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 3148800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 3148900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 3149000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 3149100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 3149200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 3149300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.029, - "step": 3149400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 3149500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 3149600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 3149700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 3149800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 3149900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0295, - "step": 3150000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 3150100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 3150200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 3150300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 3150400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 3150500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 3150600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 3150700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0292, - "step": 3150800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 3150900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 3151000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 3151100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 3151200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 3151300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 3151400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0293, - "step": 3151500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 3151600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 3151700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 3151800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 3151900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 3152000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 3152100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0337, - "step": 3152200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 3152300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 3152400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 3152500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0292, - "step": 3152600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 3152700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 3152800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 3152900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0296, - "step": 3153000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 3153100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 3153200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 3153300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 3153400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 3153500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0294, - "step": 3153600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 3153700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 3153800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 3153900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 3154000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 3154100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 3154200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 3154300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 3154400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0294, - "step": 3154500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0295, - "step": 3154600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 3154700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 3154800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 3154900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 3155000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0332, - "step": 3155100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0327, - "step": 3155200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 3155300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 3155400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 3155500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 3155600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 3155700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0354, - "step": 3155800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 3155900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 3156000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 3156100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 3156200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 3156300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 3156400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 3156500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 3156600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 3156700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 3156800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 3156900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 3157000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 3157100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 3157200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 3157300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 3157400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 3157500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 3157600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 3157700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 3157800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0295, - "step": 3157900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 3158000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 3158100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 3158200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 3158300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 3158400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 3158500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 3158600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 3158700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 3158800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 3158900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 3159000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 3159100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 3159200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 3159300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 3159400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0295, - "step": 3159500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0295, - "step": 3159600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 3159700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 3159800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 3159900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 3160000 - }, - { - "epoch": 0.0, - "eval_loss": 0.02813720703125, - "eval_runtime": 3409.7011, - "eval_samples_per_second": 329.86, - "eval_steps_per_second": 20.616, - "step": 3160000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 3160100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 3160200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 3160300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 3160400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 3160500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0296, - "step": 3160600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 3160700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 3160800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 3160900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 3161000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 3161100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 3161200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 3161300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 3161400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 3161500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 3161600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 3161700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 3161800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 3161900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 3162000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0329, - "step": 3162100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 3162200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 3162300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0364, - "step": 3162400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 3162500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 3162600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 3162700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 3162800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 3162900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 3163000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 3163100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 3163200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 3163300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 3163400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 3163500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 3163600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 3163700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 3163800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 3163900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 3164000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 3164100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 3164200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0296, - "step": 3164300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 3164400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 3164500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 3164600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 3164700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 3164800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 3164900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 3165000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 3165100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 3165200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 3165300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 3165400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 3165500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0319, - "step": 3165600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 3165700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 3165800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 3165900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 3166000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 3166100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 3166200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0285, - "step": 3166300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 3166400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 3166500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 3166600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0305, - "step": 3166700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 3166800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 3166900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 3167000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 3167100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 3167200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 3167300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 3167400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 3167500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 3167600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0326, - "step": 3167700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 3167800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 3167900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 3168000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0333, - "step": 3168100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 3168200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 3168300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 3168400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 3168500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 3168600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 3168700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 3168800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0368, - "step": 3168900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 3169000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 3169100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 3169200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 3169300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 3169400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 3169500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 3169600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 3169700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0294, - "step": 3169800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 3169900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0325, - "step": 3170000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 3170100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 3170200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 3170300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0289, - "step": 3170400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 3170500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 3170600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 3170700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0293, - "step": 3170800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 3170900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 3171000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 3171100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 3171200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 3171300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0324, - "step": 3171400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 3171500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 3171600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 3171700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 3171800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 3171900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 3172000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0292, - "step": 3172100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 3172200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0294, - "step": 3172300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0346, - "step": 3172400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 3172500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 3172600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 3172700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0339, - "step": 3172800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 3172900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0291, - "step": 3173000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 3173100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0304, - "step": 3173200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 3173300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 3173400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 3173500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 3173600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 3173700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 3173800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 3173900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 3174000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 3174100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 3174200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 3174300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 3174400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0323, - "step": 3174500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 3174600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 3174700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 3174800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0322, - "step": 3174900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 3175000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 3175100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 3175200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.031, - "step": 3175300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 3175400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 3175500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0331, - "step": 3175600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0321, - "step": 3175700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0336, - "step": 3175800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 3175900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 3176000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 3176100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 3176200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 3176300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0334, - "step": 3176400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 3176500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0301, - "step": 3176600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0308, - "step": 3176700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 3176800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0294, - "step": 3176900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0303, - "step": 3177000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0316, - "step": 3177100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0318, - "step": 3177200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0295, - "step": 3177300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0298, - "step": 3177400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 3177500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0302, - "step": 3177600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 3177700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0296, - "step": 3177800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0294, - "step": 3177900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 3178000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 3178100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 3178200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0299, - "step": 3178300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0315, - "step": 3178400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 3178500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 3178600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0313, - "step": 3178700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0309, - "step": 3178800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.033, - "step": 3178900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.032, - "step": 3179000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0311, - "step": 3179100 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0317, - "step": 3179200 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0297, - "step": 3179300 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0314, - "step": 3179400 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0295, - "step": 3179500 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0307, - "step": 3179600 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0306, - "step": 3179700 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 3179800 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.0312, - "step": 3179900 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001, - "loss": 0.03, - "step": 3180000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0281524658203125, - "eval_runtime": 3361.7471, - "eval_samples_per_second": 334.565, - "eval_steps_per_second": 20.911, - "step": 3180000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0303, - "step": 3180100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0295, - "step": 3180200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.031, - "step": 3180300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.029, - "step": 3180400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0295, - "step": 3180500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0302, - "step": 3180600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0304, - "step": 3180700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0287, - "step": 3180800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0299, - "step": 3180900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0303, - "step": 3181000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0292, - "step": 3181100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0285, - "step": 3181200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.03, - "step": 3181300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0287, - "step": 3181400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.029, - "step": 3181500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0286, - "step": 3181600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.032, - "step": 3181700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0287, - "step": 3181800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0296, - "step": 3181900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0277, - "step": 3182000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0287, - "step": 3182100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0297, - "step": 3182200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0285, - "step": 3182300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0287, - "step": 3182400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0291, - "step": 3182500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0282, - "step": 3182600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0285, - "step": 3182700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0278, - "step": 3182800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0294, - "step": 3182900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0293, - "step": 3183000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0285, - "step": 3183100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0285, - "step": 3183200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0293, - "step": 3183300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0283, - "step": 3183400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0286, - "step": 3183500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0299, - "step": 3183600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0285, - "step": 3183700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0281, - "step": 3183800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0285, - "step": 3183900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0284, - "step": 3184000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0282, - "step": 3184100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0286, - "step": 3184200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0276, - "step": 3184300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0283, - "step": 3184400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0289, - "step": 3184500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.029, - "step": 3184600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.028, - "step": 3184700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0271, - "step": 3184800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0285, - "step": 3184900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0281, - "step": 3185000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0283, - "step": 3185100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0282, - "step": 3185200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0294, - "step": 3185300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.028, - "step": 3185400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0282, - "step": 3185500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0278, - "step": 3185600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0271, - "step": 3185700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3185800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0289, - "step": 3185900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0283, - "step": 3186000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0293, - "step": 3186100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.027, - "step": 3186200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0279, - "step": 3186300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0283, - "step": 3186400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0284, - "step": 3186500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0285, - "step": 3186600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0275, - "step": 3186700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0272, - "step": 3186800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0271, - "step": 3186900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.029, - "step": 3187000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0278, - "step": 3187100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0277, - "step": 3187200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0278, - "step": 3187300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0287, - "step": 3187400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0276, - "step": 3187500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0294, - "step": 3187600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.027, - "step": 3187700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0275, - "step": 3187800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.028, - "step": 3187900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0278, - "step": 3188000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0271, - "step": 3188100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0275, - "step": 3188200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0283, - "step": 3188300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0278, - "step": 3188400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0282, - "step": 3188500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0284, - "step": 3188600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0283, - "step": 3188700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0284, - "step": 3188800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0272, - "step": 3188900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.027, - "step": 3189000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0276, - "step": 3189100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0268, - "step": 3189200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0285, - "step": 3189300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0279, - "step": 3189400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0279, - "step": 3189500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3189600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0272, - "step": 3189700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0271, - "step": 3189800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.028, - "step": 3189900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0283, - "step": 3190000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0305, - "step": 3190100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0277, - "step": 3190200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0276, - "step": 3190300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.028, - "step": 3190400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0288, - "step": 3190500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0279, - "step": 3190600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0268, - "step": 3190700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.029, - "step": 3190800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0278, - "step": 3190900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0287, - "step": 3191000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0275, - "step": 3191100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0284, - "step": 3191200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0271, - "step": 3191300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0272, - "step": 3191400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0269, - "step": 3191500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.028, - "step": 3191600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0293, - "step": 3191700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0276, - "step": 3191800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3191900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0276, - "step": 3192000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.027, - "step": 3192100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0273, - "step": 3192200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0275, - "step": 3192300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0279, - "step": 3192400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0282, - "step": 3192500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0268, - "step": 3192600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0283, - "step": 3192700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3192800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3192900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3193000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.028, - "step": 3193100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0279, - "step": 3193200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0268, - "step": 3193300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3193400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.028, - "step": 3193500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.027, - "step": 3193600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0291, - "step": 3193700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0278, - "step": 3193800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0277, - "step": 3193900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0282, - "step": 3194000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0272, - "step": 3194100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0284, - "step": 3194200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0281, - "step": 3194300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0275, - "step": 3194400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0281, - "step": 3194500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0289, - "step": 3194600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0282, - "step": 3194700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0276, - "step": 3194800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.027, - "step": 3194900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0275, - "step": 3195000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3195100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0274, - "step": 3195200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3195300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0274, - "step": 3195400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0278, - "step": 3195500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0272, - "step": 3195600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0269, - "step": 3195700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.029, - "step": 3195800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3195900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0271, - "step": 3196000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0281, - "step": 3196100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3196200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.027, - "step": 3196300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.027, - "step": 3196400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.028, - "step": 3196500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0284, - "step": 3196600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0277, - "step": 3196700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0279, - "step": 3196800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0282, - "step": 3196900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3197000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0271, - "step": 3197100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0278, - "step": 3197200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0272, - "step": 3197300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0282, - "step": 3197400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0269, - "step": 3197500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0271, - "step": 3197600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0279, - "step": 3197700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0272, - "step": 3197800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0276, - "step": 3197900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0271, - "step": 3198000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0274, - "step": 3198100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3198200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3198300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3198400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3198500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0269, - "step": 3198600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0277, - "step": 3198700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0283, - "step": 3198800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0274, - "step": 3198900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0278, - "step": 3199000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.028, - "step": 3199100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0278, - "step": 3199200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0291, - "step": 3199300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3199400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0272, - "step": 3199500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0295, - "step": 3199600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0291, - "step": 3199700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0274, - "step": 3199800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0285, - "step": 3199900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0281, - "step": 3200000 - }, - { - "epoch": 0.0, - "eval_loss": 0.02618408203125, - "eval_runtime": 3400.2762, - "eval_samples_per_second": 330.774, - "eval_steps_per_second": 20.674, - "step": 3200000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0283, - "step": 3200100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0277, - "step": 3200200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0275, - "step": 3200300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.027, - "step": 3200400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0273, - "step": 3200500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3200600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0275, - "step": 3200700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0275, - "step": 3200800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0276, - "step": 3200900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0276, - "step": 3201000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3201100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.028, - "step": 3201200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0283, - "step": 3201300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3201400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3201500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3201600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0272, - "step": 3201700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3201800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3201900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0274, - "step": 3202000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3202100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0272, - "step": 3202200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.027, - "step": 3202300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0268, - "step": 3202400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3202500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0275, - "step": 3202600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0269, - "step": 3202700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0282, - "step": 3202800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0271, - "step": 3202900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3203000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0278, - "step": 3203100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0284, - "step": 3203200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0273, - "step": 3203300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0273, - "step": 3203400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3203500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.027, - "step": 3203600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3203700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0285, - "step": 3203800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0277, - "step": 3203900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0273, - "step": 3204000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.027, - "step": 3204100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3204200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0269, - "step": 3204300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0274, - "step": 3204400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.027, - "step": 3204500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0285, - "step": 3204600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0274, - "step": 3204700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3204800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0279, - "step": 3204900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.027, - "step": 3205000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3205100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0275, - "step": 3205200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0272, - "step": 3205300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3205400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0282, - "step": 3205500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0271, - "step": 3205600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3205700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0272, - "step": 3205800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0276, - "step": 3205900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0273, - "step": 3206000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0277, - "step": 3206100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3206200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0286, - "step": 3206300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0278, - "step": 3206400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0274, - "step": 3206500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0268, - "step": 3206600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0272, - "step": 3206700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0275, - "step": 3206800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0275, - "step": 3206900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.028, - "step": 3207000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0274, - "step": 3207100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.028, - "step": 3207200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0276, - "step": 3207300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0285, - "step": 3207400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0272, - "step": 3207500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0275, - "step": 3207600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3207700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0273, - "step": 3207800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0282, - "step": 3207900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0278, - "step": 3208000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.028, - "step": 3208100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0272, - "step": 3208200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0272, - "step": 3208300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0278, - "step": 3208400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0275, - "step": 3208500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3208600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0285, - "step": 3208700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0269, - "step": 3208800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3208900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3209000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0276, - "step": 3209100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0272, - "step": 3209200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3209300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0269, - "step": 3209400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3209500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0274, - "step": 3209600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0271, - "step": 3209700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0275, - "step": 3209800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3209900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3210000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0271, - "step": 3210100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0276, - "step": 3210200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0276, - "step": 3210300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3210400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3210500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0272, - "step": 3210600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.027, - "step": 3210700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3210800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0269, - "step": 3210900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0273, - "step": 3211000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.027, - "step": 3211100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0277, - "step": 3211200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3211300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0271, - "step": 3211400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.027, - "step": 3211500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3211600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0274, - "step": 3211700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0274, - "step": 3211800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3211900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0272, - "step": 3212000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0277, - "step": 3212100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3212200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.027, - "step": 3212300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3212400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0273, - "step": 3212500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3212600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0271, - "step": 3212700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3212800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3212900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3213000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0276, - "step": 3213100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0274, - "step": 3213200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3213300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0272, - "step": 3213400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.027, - "step": 3213500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3213600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.027, - "step": 3213700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.027, - "step": 3213800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3213900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3214000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3214100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0273, - "step": 3214200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.028, - "step": 3214300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0274, - "step": 3214400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3214500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0269, - "step": 3214600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0274, - "step": 3214700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0271, - "step": 3214800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3214900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0281, - "step": 3215000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3215100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3215200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0272, - "step": 3215300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3215400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0269, - "step": 3215500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3215600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0274, - "step": 3215700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3215800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0278, - "step": 3215900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0271, - "step": 3216000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3216100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0268, - "step": 3216200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0271, - "step": 3216300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3216400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0271, - "step": 3216500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0273, - "step": 3216600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0272, - "step": 3216700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3216800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.027, - "step": 3216900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0268, - "step": 3217000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3217100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.027, - "step": 3217200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3217300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0268, - "step": 3217400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.027, - "step": 3217500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3217600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0275, - "step": 3217700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0269, - "step": 3217800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0272, - "step": 3217900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0272, - "step": 3218000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3218100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.027, - "step": 3218200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0268, - "step": 3218300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.027, - "step": 3218400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.027, - "step": 3218500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0268, - "step": 3218600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3218700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0272, - "step": 3218800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0274, - "step": 3218900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3219000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3219100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3219200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0281, - "step": 3219300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0275, - "step": 3219400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3219500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.027, - "step": 3219600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0273, - "step": 3219700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3219800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0277, - "step": 3219900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3220000 - }, - { - "epoch": 0.0, - "eval_loss": 0.02593994140625, - "eval_runtime": 3278.6598, - "eval_samples_per_second": 343.044, - "eval_steps_per_second": 21.44, - "step": 3220000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3220100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3220200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0283, - "step": 3220300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3220400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0275, - "step": 3220500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3220600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3220700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3220800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0273, - "step": 3220900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3221000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3221100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3221200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3221300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0269, - "step": 3221400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.027, - "step": 3221500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3221600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3221700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0273, - "step": 3221800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3221900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0269, - "step": 3222000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3222100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0275, - "step": 3222200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3222300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.027, - "step": 3222400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3222500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0272, - "step": 3222600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3222700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3222800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3222900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.027, - "step": 3223000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0276, - "step": 3223100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3223200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3223300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.027, - "step": 3223400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0279, - "step": 3223500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.027, - "step": 3223600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0269, - "step": 3223700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3223800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0279, - "step": 3223900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3224000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3224100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3224200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0272, - "step": 3224300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3224400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0271, - "step": 3224500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3224600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3224700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0271, - "step": 3224800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0282, - "step": 3224900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3225000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3225100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0271, - "step": 3225200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3225300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3225400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3225500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0268, - "step": 3225600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3225700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0275, - "step": 3225800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0272, - "step": 3225900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0277, - "step": 3226000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0273, - "step": 3226100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0268, - "step": 3226200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3226300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3226400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3226500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3226600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0272, - "step": 3226700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3226800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3226900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0269, - "step": 3227000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3227100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0276, - "step": 3227200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3227300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0273, - "step": 3227400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3227500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3227600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0272, - "step": 3227700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0278, - "step": 3227800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.027, - "step": 3227900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0287, - "step": 3228000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0271, - "step": 3228100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0274, - "step": 3228200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0275, - "step": 3228300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3228400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3228500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0277, - "step": 3228600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3228700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3228800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3228900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3229000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3229100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3229200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3229300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0271, - "step": 3229400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3229500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3229600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0273, - "step": 3229700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3229800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3229900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0269, - "step": 3230000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0275, - "step": 3230100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0277, - "step": 3230200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0274, - "step": 3230300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3230400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0274, - "step": 3230500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3230600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0269, - "step": 3230700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3230800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0275, - "step": 3230900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0268, - "step": 3231000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0271, - "step": 3231100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3231200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0269, - "step": 3231300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3231400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0274, - "step": 3231500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0269, - "step": 3231600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0283, - "step": 3231700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0279, - "step": 3231800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3231900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3232000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0274, - "step": 3232100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0278, - "step": 3232200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0273, - "step": 3232300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3232400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3232500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.027, - "step": 3232600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.027, - "step": 3232700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0281, - "step": 3232800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3232900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3233000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3233100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3233200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3233300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3233400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3233500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3233600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0269, - "step": 3233700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3233800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0271, - "step": 3233900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0274, - "step": 3234000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3234100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3234200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0268, - "step": 3234300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3234400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3234500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3234600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0286, - "step": 3234700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3234800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3234900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3235000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0275, - "step": 3235100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3235200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3235300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3235400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0276, - "step": 3235500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3235600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3235700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3235800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3235900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3236000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3236100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3236200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3236300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3236400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3236500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0271, - "step": 3236600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0284, - "step": 3236700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3236800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0273, - "step": 3236900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3237000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3237100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0272, - "step": 3237200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0285, - "step": 3237300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3237400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3237500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3237600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0285, - "step": 3237700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3237800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0269, - "step": 3237900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3238000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0272, - "step": 3238100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3238200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3238300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3238400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3238500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3238600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0273, - "step": 3238700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3238800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3238900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3239000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3239100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3239200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0269, - "step": 3239300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3239400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0268, - "step": 3239500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0275, - "step": 3239600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0282, - "step": 3239700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3239800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3239900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3240000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0257415771484375, - "eval_runtime": 3195.2132, - "eval_samples_per_second": 352.002, - "eval_steps_per_second": 22.0, - "step": 3240000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0271, - "step": 3240100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0269, - "step": 3240200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3240300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3240400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3240500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3240600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3240700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0271, - "step": 3240800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3240900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0269, - "step": 3241000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0268, - "step": 3241100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3241200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0271, - "step": 3241300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3241400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3241500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3241600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3241700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0276, - "step": 3241800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3241900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3242000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3242100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3242200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3242300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3242400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3242500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3242600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3242700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3242800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3242900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0269, - "step": 3243000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3243100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3243200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3243300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3243400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3243500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0272, - "step": 3243600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3243700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3243800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0268, - "step": 3243900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3244000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3244100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3244200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3244300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3244400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0276, - "step": 3244500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3244600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0276, - "step": 3244700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0275, - "step": 3244800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3244900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3245000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3245100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.027, - "step": 3245200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0277, - "step": 3245300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3245400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3245500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3245600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3245700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3245800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3245900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3246000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3246100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3246200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3246300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3246400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0274, - "step": 3246500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3246600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3246700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0268, - "step": 3246800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0271, - "step": 3246900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3247000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3247100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3247200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3247300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0269, - "step": 3247400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3247500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0268, - "step": 3247600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0268, - "step": 3247700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0274, - "step": 3247800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.027, - "step": 3247900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0269, - "step": 3248000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3248100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0273, - "step": 3248200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3248300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3248400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3248500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3248600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3248700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3248800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0277, - "step": 3248900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3249000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3249100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3249200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3249300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3249400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3249500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0273, - "step": 3249600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3249700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3249800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3249900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3250000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3250100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3250200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3250300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3250400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0268, - "step": 3250500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3250600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3250700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3250800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3250900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3251000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3251100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0268, - "step": 3251200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3251300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3251400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0279, - "step": 3251500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0272, - "step": 3251600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3251700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3251800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3251900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3252000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3252100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3252200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3252300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3252400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0268, - "step": 3252500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3252600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3252700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0246, - "step": 3252800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3252900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0272, - "step": 3253000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0271, - "step": 3253100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3253200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3253300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3253400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3253500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0271, - "step": 3253600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.027, - "step": 3253700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3253800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3253900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3254000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0271, - "step": 3254100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3254200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3254300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3254400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3254500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3254600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3254700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3254800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0268, - "step": 3254900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0275, - "step": 3255000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0271, - "step": 3255100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3255200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0268, - "step": 3255300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0268, - "step": 3255400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3255500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0274, - "step": 3255600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3255700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0269, - "step": 3255800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3255900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3256000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3256100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0271, - "step": 3256200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3256300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3256400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3256500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0276, - "step": 3256600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3256700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3256800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3256900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3257000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3257100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3257200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0268, - "step": 3257300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0269, - "step": 3257400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3257500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3257600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0276, - "step": 3257700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3257800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3257900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0269, - "step": 3258000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3258100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3258200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3258300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3258400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3258500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0283, - "step": 3258600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0272, - "step": 3258700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3258800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3258900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3259000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3259100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3259200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3259300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3259400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3259500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3259600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3259700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3259800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3259900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0268, - "step": 3260000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0256500244140625, - "eval_runtime": 3120.5744, - "eval_samples_per_second": 360.422, - "eval_steps_per_second": 22.527, - "step": 3260000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3260100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0273, - "step": 3260200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3260300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3260400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0269, - "step": 3260500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3260600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3260700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3260800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0269, - "step": 3260900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3261000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3261100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3261200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3261300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3261400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3261500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3261600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3261700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.027, - "step": 3261800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3261900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3262000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3262100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3262200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3262300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0268, - "step": 3262400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3262500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3262600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3262700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0244, - "step": 3262800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3262900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3263000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3263100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0268, - "step": 3263200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0274, - "step": 3263300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3263400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0279, - "step": 3263500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.027, - "step": 3263600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3263700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0268, - "step": 3263800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3263900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0268, - "step": 3264000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3264100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3264200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3264300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3264400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3264500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0271, - "step": 3264600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3264700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0271, - "step": 3264800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0269, - "step": 3264900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3265000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0269, - "step": 3265100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3265200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0271, - "step": 3265300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3265400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3265500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0277, - "step": 3265600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0272, - "step": 3265700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3265800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3265900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0271, - "step": 3266000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3266100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0273, - "step": 3266200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3266300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3266400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0268, - "step": 3266500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0269, - "step": 3266600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3266700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3266800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0269, - "step": 3266900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3267000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3267100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3267200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0271, - "step": 3267300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3267400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3267500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.027, - "step": 3267600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0274, - "step": 3267700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3267800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0268, - "step": 3267900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3268000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3268100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3268200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3268300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3268400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0274, - "step": 3268500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3268600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3268700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3268800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0272, - "step": 3268900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3269000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3269100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3269200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3269300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3269400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3269500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3269600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0283, - "step": 3269700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0275, - "step": 3269800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3269900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3270000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3270100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3270200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3270300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3270400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3270500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3270600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3270700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3270800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3270900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3271000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3271100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3271200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3271300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3271400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3271500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3271600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3271700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3271800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0272, - "step": 3271900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3272000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3272100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3272200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3272300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3272400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0277, - "step": 3272500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0247, - "step": 3272600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3272700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3272800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3272900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0268, - "step": 3273000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3273100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3273200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0275, - "step": 3273300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3273400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3273500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3273600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3273700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3273800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0269, - "step": 3273900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0269, - "step": 3274000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3274100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0273, - "step": 3274200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3274300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3274400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3274500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3274600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3274700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3274800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3274900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3275000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3275100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3275200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3275300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0274, - "step": 3275400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3275500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3275600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3275700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3275800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3275900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3276000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3276100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3276200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0277, - "step": 3276300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3276400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3276500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3276600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3276700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3276800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3276900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0279, - "step": 3277000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3277100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3277200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3277300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3277400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3277500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3277600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3277700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3277800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3277900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0272, - "step": 3278000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3278100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0247, - "step": 3278200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3278300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3278400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3278500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3278600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3278700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3278800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3278900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3279000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3279100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3279200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0268, - "step": 3279300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3279400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3279500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3279600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3279700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3279800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3279900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3280000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0255584716796875, - "eval_runtime": 3272.9094, - "eval_samples_per_second": 343.646, - "eval_steps_per_second": 21.478, - "step": 3280000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3280100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0275, - "step": 3280200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3280300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3280400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3280500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0246, - "step": 3280600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0274, - "step": 3280700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3280800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3280900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3281000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3281100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3281200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3281300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0272, - "step": 3281400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0271, - "step": 3281500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3281600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3281700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3281800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3281900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3282000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3282100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3282200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3282300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3282400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0272, - "step": 3282500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3282600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3282700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3282800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0283, - "step": 3282900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3283000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3283100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3283200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0269, - "step": 3283300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3283400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0272, - "step": 3283500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3283600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3283700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3283800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3283900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0269, - "step": 3284000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3284100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0269, - "step": 3284200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3284300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3284400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3284500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3284600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3284700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3284800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3284900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3285000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3285100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3285200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3285300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3285400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3285500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3285600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3285700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3285800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3285900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3286000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3286100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3286200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3286300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3286400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3286500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0269, - "step": 3286600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0268, - "step": 3286700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3286800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3286900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3287000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3287100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3287200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3287300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3287400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3287500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3287600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3287700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3287800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3287900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3288000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3288100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0269, - "step": 3288200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3288300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0268, - "step": 3288400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3288500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3288600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3288700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3288800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0273, - "step": 3288900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3289000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3289100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0273, - "step": 3289200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3289300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3289400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3289500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3289600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0271, - "step": 3289700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0268, - "step": 3289800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3289900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3290000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3290100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3290200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3290300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3290400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3290500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3290600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3290700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0279, - "step": 3290800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3290900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0271, - "step": 3291000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3291100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0268, - "step": 3291200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3291300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3291400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3291500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0285, - "step": 3291600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3291700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3291800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3291900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3292000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3292100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3292200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3292300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3292400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3292500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0273, - "step": 3292600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3292700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3292800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3292900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3293000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3293100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0278, - "step": 3293200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3293300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0275, - "step": 3293400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.027, - "step": 3293500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3293600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3293700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3293800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3293900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3294000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3294100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3294200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0275, - "step": 3294300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3294400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3294500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3294600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3294700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3294800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3294900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0272, - "step": 3295000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3295100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3295200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3295300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3295400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3295500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3295600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3295700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3295800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3295900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3296000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3296100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3296200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3296300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3296400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0268, - "step": 3296500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0268, - "step": 3296600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3296700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3296800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3296900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0274, - "step": 3297000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3297100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3297200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3297300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3297400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3297500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3297600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3297700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3297800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0273, - "step": 3297900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0278, - "step": 3298000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3298100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3298200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0271, - "step": 3298300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3298400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3298500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3298600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3298700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3298800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3298900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3299000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3299100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3299200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3299300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3299400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3299500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3299600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0272, - "step": 3299700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3299800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3299900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3300000 - }, - { - "epoch": 0.0, - "eval_loss": 0.025482177734375, - "eval_runtime": 3277.7006, - "eval_samples_per_second": 343.144, - "eval_steps_per_second": 21.447, - "step": 3300000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0268, - "step": 3300100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3300200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3300300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3300400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3300500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3300600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3300700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3300800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3300900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3301000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3301100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3301200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3301300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3301400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3301500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3301600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0273, - "step": 3301700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3301800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3301900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3302000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0269, - "step": 3302100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3302200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3302300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3302400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3302500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0269, - "step": 3302600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3302700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3302800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3302900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0279, - "step": 3303000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3303100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3303200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0268, - "step": 3303300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3303400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0246, - "step": 3303500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3303600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3303700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3303800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3303900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3304000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3304100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3304200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3304300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3304400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3304500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0269, - "step": 3304600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3304700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3304800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3304900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3305000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3305100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3305200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3305300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3305400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3305500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3305600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0282, - "step": 3305700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3305800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3305900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0268, - "step": 3306000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3306100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3306200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3306300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3306400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0269, - "step": 3306500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.027, - "step": 3306600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3306700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3306800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0273, - "step": 3306900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3307000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0268, - "step": 3307100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3307200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0246, - "step": 3307300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3307400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3307500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3307600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3307700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3307800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3307900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3308000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0272, - "step": 3308100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3308200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3308300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3308400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3308500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3308600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3308700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3308800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3308900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0275, - "step": 3309000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3309100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3309200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3309300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0269, - "step": 3309400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0269, - "step": 3309500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3309600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3309700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3309800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3309900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3310000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3310100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3310200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.027, - "step": 3310300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3310400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3310500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3310600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3310700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3310800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3310900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3311000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0268, - "step": 3311100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3311200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0268, - "step": 3311300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3311400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3311500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3311600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3311700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3311800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3311900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3312000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3312100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3312200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3312300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0273, - "step": 3312400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0268, - "step": 3312500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3312600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3312700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3312800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3312900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3313000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3313100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3313200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3313300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0269, - "step": 3313400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3313500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3313600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3313700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3313800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3313900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3314000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0272, - "step": 3314100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3314200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0269, - "step": 3314300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0269, - "step": 3314400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3314500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3314600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3314700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3314800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0272, - "step": 3314900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3315000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3315100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0274, - "step": 3315200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3315300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0268, - "step": 3315400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3315500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3315600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3315700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3315800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3315900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.027, - "step": 3316000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3316100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3316200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3316300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3316400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3316500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3316600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3316700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3316800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3316900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3317000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3317100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0246, - "step": 3317200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3317300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.027, - "step": 3317400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3317500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3317600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3317700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3317800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3317900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3318000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0272, - "step": 3318100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3318200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0279, - "step": 3318300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3318400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3318500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3318600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3318700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3318800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3318900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3319000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3319100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3319200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0274, - "step": 3319300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3319400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3319500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3319600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3319700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3319800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0287, - "step": 3319900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3320000 - }, - { - "epoch": 0.0, - "eval_loss": 0.025390625, - "eval_runtime": 3295.1049, - "eval_samples_per_second": 341.331, - "eval_steps_per_second": 21.333, - "step": 3320000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0274, - "step": 3320100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3320200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3320300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3320400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3320500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3320600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3320700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3320800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3320900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3321000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3321100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0272, - "step": 3321200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3321300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3321400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3321500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3321600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3321700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3321800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3321900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3322000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3322100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0277, - "step": 3322200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0272, - "step": 3322300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3322400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0271, - "step": 3322500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3322600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3322700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3322800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3322900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3323000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3323100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3323200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3323300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3323400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3323500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0285, - "step": 3323600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3323700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3323800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3323900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3324000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3324100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0271, - "step": 3324200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3324300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0271, - "step": 3324400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3324500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3324600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3324700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3324800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3324900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3325000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0281, - "step": 3325100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3325200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3325300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3325400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0273, - "step": 3325500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3325600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3325700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3325800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3325900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3326000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3326100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3326200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3326300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0274, - "step": 3326400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3326500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3326600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3326700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3326800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3326900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3327000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3327100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0277, - "step": 3327200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3327300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0273, - "step": 3327400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3327500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3327600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3327700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3327800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3327900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3328000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3328100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3328200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3328300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3328400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3328500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3328600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3328700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3328800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0268, - "step": 3328900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0243, - "step": 3329000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3329100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3329200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0272, - "step": 3329300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3329400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0268, - "step": 3329500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3329600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3329700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3329800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3329900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3330000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3330100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3330200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0271, - "step": 3330300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3330400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3330500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0268, - "step": 3330600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3330700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3330800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0274, - "step": 3330900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.027, - "step": 3331000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3331100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0273, - "step": 3331200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.027, - "step": 3331300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3331400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3331500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3331600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3331700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3331800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3331900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3332000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3332100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3332200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3332300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0268, - "step": 3332400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3332500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3332600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3332700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3332800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3332900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3333000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3333100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3333200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3333300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3333400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3333500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0277, - "step": 3333600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0273, - "step": 3333700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0268, - "step": 3333800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3333900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3334000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3334100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3334200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3334300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3334400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3334500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3334600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3334700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3334800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3334900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0273, - "step": 3335000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3335100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3335200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3335300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3335400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3335500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3335600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3335700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3335800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3335900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.027, - "step": 3336000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3336100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3336200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3336300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3336400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3336500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3336600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3336700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3336800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3336900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3337000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3337100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3337200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3337300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.027, - "step": 3337400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3337500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3337600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3337700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3337800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3337900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3338000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3338100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3338200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3338300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3338400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3338500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0268, - "step": 3338600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3338700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3338800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3338900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3339000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3339100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3339200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3339300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3339400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3339500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3339600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3339700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3339800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3339900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0272, - "step": 3340000 - }, - { - "epoch": 0.0, - "eval_loss": 0.02532958984375, - "eval_runtime": 3251.3678, - "eval_samples_per_second": 345.923, - "eval_steps_per_second": 21.62, - "step": 3340000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3340100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3340200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3340300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3340400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.027, - "step": 3340500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3340600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3340700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3340800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3340900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3341000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3341100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3341200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3341300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3341400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3341500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3341600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3341700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3341800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3341900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3342000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3342100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0268, - "step": 3342200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3342300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3342400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3342500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3342600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0269, - "step": 3342700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3342800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3342900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0268, - "step": 3343000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3343100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3343200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3343300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3343400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3343500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3343600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3343700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3343800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3343900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3344000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3344100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3344200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3344300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3344400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3344500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0269, - "step": 3344600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.027, - "step": 3344700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3344800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3344900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3345000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3345100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3345200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3345300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0271, - "step": 3345400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3345500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3345600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3345700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3345800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3345900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3346000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3346100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3346200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3346300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3346400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3346500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3346600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3346700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3346800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3346900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0277, - "step": 3347000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3347100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3347200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3347300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3347400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3347500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3347600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3347700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3347800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3347900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3348000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3348100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3348200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3348300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0268, - "step": 3348400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3348500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3348600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3348700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3348800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3348900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3349000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0268, - "step": 3349100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3349200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3349300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3349400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3349500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3349600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3349700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3349800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0268, - "step": 3349900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3350000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3350100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0276, - "step": 3350200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3350300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3350400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0268, - "step": 3350500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0247, - "step": 3350600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3350700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3350800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3350900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3351000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3351100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3351200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3351300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3351400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3351500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3351600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3351700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3351800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3351900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3352000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3352100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3352200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3352300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3352400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3352500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3352600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.027, - "step": 3352700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3352800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3352900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3353000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3353100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3353200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3353300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3353400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3353500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3353600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3353700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0269, - "step": 3353800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3353900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3354000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3354100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3354200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3354300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3354400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3354500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3354600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3354700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3354800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3354900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3355000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3355100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3355200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3355300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3355400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3355500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3355600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3355700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3355800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3355900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3356000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3356100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3356200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3356300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3356400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3356500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0269, - "step": 3356600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0269, - "step": 3356700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3356800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3356900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3357000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3357100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3357200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3357300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3357400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3357500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0273, - "step": 3357600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0271, - "step": 3357700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3357800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3357900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3358000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0245, - "step": 3358100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3358200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3358300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3358400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3358500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3358600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3358700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3358800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3358900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3359000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0268, - "step": 3359100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0268, - "step": 3359200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3359300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0246, - "step": 3359400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3359500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3359600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3359700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3359800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3359900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3360000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0253448486328125, - "eval_runtime": 3387.197, - "eval_samples_per_second": 332.051, - "eval_steps_per_second": 20.753, - "step": 3360000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3360100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3360200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3360300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0269, - "step": 3360400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3360500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3360600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3360700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0245, - "step": 3360800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3360900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3361000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3361100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3361200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3361300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3361400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3361500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3361600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3361700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0275, - "step": 3361800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3361900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3362000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3362100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3362200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3362300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3362400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0275, - "step": 3362500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3362600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3362700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3362800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3362900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.027, - "step": 3363000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3363100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3363200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3363300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3363400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3363500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3363600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3363700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3363800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3363900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3364000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3364100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3364200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3364300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3364400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0272, - "step": 3364500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3364600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.027, - "step": 3364700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0273, - "step": 3364800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3364900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3365000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3365100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0275, - "step": 3365200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3365300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3365400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3365500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0275, - "step": 3365600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3365700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3365800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3365900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3366000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3366100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3366200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3366300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0268, - "step": 3366400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3366500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3366600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3366700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3366800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0272, - "step": 3366900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3367000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0272, - "step": 3367100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0274, - "step": 3367200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3367300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0271, - "step": 3367400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3367500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3367600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0247, - "step": 3367700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0281, - "step": 3367800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3367900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3368000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3368100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3368200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3368300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3368400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3368500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3368600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3368700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3368800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3368900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0245, - "step": 3369000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3369100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3369200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3369300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3369400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3369500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3369600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3369700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3369800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3369900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3370000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3370100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3370200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3370300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3370400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3370500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3370600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3370700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0271, - "step": 3370800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3370900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3371000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3371100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0271, - "step": 3371200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3371300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3371400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3371500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3371600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3371700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3371800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3371900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3372000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3372100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3372200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3372300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3372400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3372500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3372600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3372700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3372800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3372900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0272, - "step": 3373000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3373100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3373200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3373300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3373400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3373500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3373600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3373700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3373800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3373900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3374000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3374100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3374200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3374300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3374400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3374500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0271, - "step": 3374600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3374700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3374800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0271, - "step": 3374900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0281, - "step": 3375000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3375100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3375200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3375300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3375400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3375500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0273, - "step": 3375600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3375700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3375800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3375900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3376000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3376100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3376200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0268, - "step": 3376300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3376400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3376500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3376600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3376700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3376800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3376900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3377000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3377100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3377200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3377300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0268, - "step": 3377400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3377500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3377600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3377700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3377800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3377900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3378000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3378100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3378200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3378300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0245, - "step": 3378400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3378500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3378600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3378700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3378800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3378900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3379000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3379100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3379200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3379300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3379400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3379500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3379600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3379700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3379800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3379900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3380000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0252685546875, - "eval_runtime": 3642.4195, - "eval_samples_per_second": 308.785, - "eval_steps_per_second": 19.299, - "step": 3380000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3380100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3380200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3380300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3380400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3380500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3380600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3380700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3380800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3380900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3381000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3381100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3381200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3381300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3381400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3381500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3381600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3381700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3381800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3381900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3382000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3382100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0269, - "step": 3382200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3382300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3382400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3382500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3382600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3382700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3382800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3382900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3383000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3383100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3383200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3383300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3383400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3383500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3383600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3383700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3383800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3383900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3384000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3384100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3384200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3384300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3384400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3384500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3384600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3384700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0269, - "step": 3384800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3384900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3385000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3385100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3385200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3385300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3385400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3385500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3385600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3385700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3385800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3385900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3386000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3386100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0278, - "step": 3386200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3386300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3386400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3386500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3386600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3386700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3386800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3386900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3387000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3387100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3387200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3387300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3387400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3387500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3387600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3387700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3387800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3387900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3388000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3388100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3388200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0269, - "step": 3388300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3388400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3388500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3388600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3388700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3388800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0276, - "step": 3388900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3389000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3389100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0246, - "step": 3389200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3389300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3389400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3389500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3389600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3389700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0274, - "step": 3389800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3389900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3390000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3390100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3390200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3390300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3390400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0276, - "step": 3390500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3390600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3390700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3390800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3390900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3391000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0247, - "step": 3391100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3391200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3391300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3391400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3391500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0268, - "step": 3391600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3391700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3391800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0269, - "step": 3391900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3392000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3392100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3392200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3392300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3392400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3392500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3392600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3392700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3392800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3392900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3393000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3393100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3393200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3393300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3393400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.027, - "step": 3393500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3393600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3393700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3393800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3393900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3394000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3394100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3394200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0245, - "step": 3394300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.027, - "step": 3394400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3394500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3394600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0268, - "step": 3394700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3394800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3394900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3395000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3395100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3395200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3395300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3395400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3395500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3395600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3395700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0247, - "step": 3395800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0271, - "step": 3395900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3396000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3396100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.027, - "step": 3396200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3396300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3396400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3396500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3396600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3396700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3396800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3396900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3397000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0268, - "step": 3397100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3397200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0276, - "step": 3397300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3397400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3397500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3397600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3397700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3397800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3397900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3398000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3398100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3398200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3398300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3398400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3398500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3398600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3398700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3398800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3398900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0271, - "step": 3399000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3399100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3399200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0275, - "step": 3399300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3399400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3399500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3399600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3399700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3399800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3399900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3400000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0252838134765625, - "eval_runtime": 3529.8352, - "eval_samples_per_second": 318.633, - "eval_steps_per_second": 19.915, - "step": 3400000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3400100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3400200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3400300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3400400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3400500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3400600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3400700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3400800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0272, - "step": 3400900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3401000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3401100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3401200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3401300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3401400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3401500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3401600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.027, - "step": 3401700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3401800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3401900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0269, - "step": 3402000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3402100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3402200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3402300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3402400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0246, - "step": 3402500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3402600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0271, - "step": 3402700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3402800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3402900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3403000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3403100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3403200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3403300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3403400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3403500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3403600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3403700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3403800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3403900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3404000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3404100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3404200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3404300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3404400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3404500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3404600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3404700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3404800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3404900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3405000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3405100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3405200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3405300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3405400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3405500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3405600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3405700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3405800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3405900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3406000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3406100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3406200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3406300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3406400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3406500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0269, - "step": 3406600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3406700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3406800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3406900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3407000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3407100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3407200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3407300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3407400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3407500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3407600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3407700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3407800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3407900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3408000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3408100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3408200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3408300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3408400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3408500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0273, - "step": 3408600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3408700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0244, - "step": 3408800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3408900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3409000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0271, - "step": 3409100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3409200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3409300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3409400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3409500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0268, - "step": 3409600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3409700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3409800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3409900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3410000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3410100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3410200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3410300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3410400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3410500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3410600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3410700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3410800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3410900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3411000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0237, - "step": 3411100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3411200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3411300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3411400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3411500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3411600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3411700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3411800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3411900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3412000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3412100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3412200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3412300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3412400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0268, - "step": 3412500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3412600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3412700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3412800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3412900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3413000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.027, - "step": 3413100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3413200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3413300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3413400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3413500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3413600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3413700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3413800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3413900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3414000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0244, - "step": 3414100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3414200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3414300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3414400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3414500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0268, - "step": 3414600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3414700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3414800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3414900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3415000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3415100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3415200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0247, - "step": 3415300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3415400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3415500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3415600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3415700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3415800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3415900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3416000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3416100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3416200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3416300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3416400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3416500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3416600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3416700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3416800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3416900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.027, - "step": 3417000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3417100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3417200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3417300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3417400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.027, - "step": 3417500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0268, - "step": 3417600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3417700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3417800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3417900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3418000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3418100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3418200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3418300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3418400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0269, - "step": 3418500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3418600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3418700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3418800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3418900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3419000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3419100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3419200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0269, - "step": 3419300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3419400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3419500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3419600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3419700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3419800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0245, - "step": 3419900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0247, - "step": 3420000 - }, - { - "epoch": 0.0, - "eval_loss": 0.025238037109375, - "eval_runtime": 3246.5247, - "eval_samples_per_second": 346.439, - "eval_steps_per_second": 21.653, - "step": 3420000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0246, - "step": 3420100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3420200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3420300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3420400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3420500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3420600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3420700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3420800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0247, - "step": 3420900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3421000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3421100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3421200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3421300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3421400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3421500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3421600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3421700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3421800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3421900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3422000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3422100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3422200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3422300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3422400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3422500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0269, - "step": 3422600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3422700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3422800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3422900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3423000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3423100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3423200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3423300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3423400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3423500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3423600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3423700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3423800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3423900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3424000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3424100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3424200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3424300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3424400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0268, - "step": 3424500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3424600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3424700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3424800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3424900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3425000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3425100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0269, - "step": 3425200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3425300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3425400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3425500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3425600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0276, - "step": 3425700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3425800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0245, - "step": 3425900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3426000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3426100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3426200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3426300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3426400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3426500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3426600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3426700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3426800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3426900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3427000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.027, - "step": 3427100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3427200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3427300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3427400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3427500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3427600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3427700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3427800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3427900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3428000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3428100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3428200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3428300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3428400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3428500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3428600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3428700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3428800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3428900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3429000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3429100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3429200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0247, - "step": 3429300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3429400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3429500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3429600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3429700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0247, - "step": 3429800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3429900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3430000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3430100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3430200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0271, - "step": 3430300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0242, - "step": 3430400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3430500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0269, - "step": 3430600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3430700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0269, - "step": 3430800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3430900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3431000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3431100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3431200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3431300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3431400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3431500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3431600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3431700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3431800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3431900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3432000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3432100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3432200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3432300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3432400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3432500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3432600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3432700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3432800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3432900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3433000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3433100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3433200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3433300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3433400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3433500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3433600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3433700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3433800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3433900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3434000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3434100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3434200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3434300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3434400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0269, - "step": 3434500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3434600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3434700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3434800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0242, - "step": 3434900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3435000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3435100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0247, - "step": 3435200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3435300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3435400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3435500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3435600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3435700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3435800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3435900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3436000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3436100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0269, - "step": 3436200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3436300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3436400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3436500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3436600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3436700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3436800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3436900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3437000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3437100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3437200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3437300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0269, - "step": 3437400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3437500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0272, - "step": 3437600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3437700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0296, - "step": 3437800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3437900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3438000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3438100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3438200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3438300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3438400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3438500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3438600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0246, - "step": 3438700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3438800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3438900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3439000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3439100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3439200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3439300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3439400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3439500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3439600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3439700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3439800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3439900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3440000 - }, - { - "epoch": 0.0, - "eval_loss": 0.025177001953125, - "eval_runtime": 3132.3556, - "eval_samples_per_second": 359.066, - "eval_steps_per_second": 22.442, - "step": 3440000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3440100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3440200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3440300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3440400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3440500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3440600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0268, - "step": 3440700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3440800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3440900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3441000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3441100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3441200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3441300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3441400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3441500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3441600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3441700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3441800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3441900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0245, - "step": 3442000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3442100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3442200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3442300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0268, - "step": 3442400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3442500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3442600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3442700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3442800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3442900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3443000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0269, - "step": 3443100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0269, - "step": 3443200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3443300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3443400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3443500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3443600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3443700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3443800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3443900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0268, - "step": 3444000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0242, - "step": 3444100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3444200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3444300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3444400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3444500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3444600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3444700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3444800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3444900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3445000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3445100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3445200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3445300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0247, - "step": 3445400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3445500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3445600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0269, - "step": 3445700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0272, - "step": 3445800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3445900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3446000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3446100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3446200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3446300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3446400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3446500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3446600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3446700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3446800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3446900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3447000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0268, - "step": 3447100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3447200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3447300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3447400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3447500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3447600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3447700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0272, - "step": 3447800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3447900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3448000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0247, - "step": 3448100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3448200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3448300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3448400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3448500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3448600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3448700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3448800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3448900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3449000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3449100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3449200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3449300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3449400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3449500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3449600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3449700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3449800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3449900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3450000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3450100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3450200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0247, - "step": 3450300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3450400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3450500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3450600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3450700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3450800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3450900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3451000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3451100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3451200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3451300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3451400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3451500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3451600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3451700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3451800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3451900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3452000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0247, - "step": 3452100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3452200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3452300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3452400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3452500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3452600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3452700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3452800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0268, - "step": 3452900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3453000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3453100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3453200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0268, - "step": 3453300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0273, - "step": 3453400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3453500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3453600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3453700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3453800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0245, - "step": 3453900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3454000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3454100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3454200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3454300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0246, - "step": 3454400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3454500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3454600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3454700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3454800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3454900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3455000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3455100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3455200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3455300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0241, - "step": 3455400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3455500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3455600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3455700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3455800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3455900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3456000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3456100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3456200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0269, - "step": 3456300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3456400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3456500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3456600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3456700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3456800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3456900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3457000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3457100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3457200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3457300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3457400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3457500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3457600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3457700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3457800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3457900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3458000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3458100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3458200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3458300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3458400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3458500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3458600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3458700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3458800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3458900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0243, - "step": 3459000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0247, - "step": 3459100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3459200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3459300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3459400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3459500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3459600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3459700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3459800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3459900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3460000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0251617431640625, - "eval_runtime": 3022.7979, - "eval_samples_per_second": 372.08, - "eval_steps_per_second": 23.255, - "step": 3460000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3460100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3460200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3460300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3460400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3460500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3460600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3460700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3460800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3460900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3461000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3461100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3461200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0243, - "step": 3461300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3461400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3461500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3461600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3461700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3461800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3461900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3462000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3462100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3462200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3462300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3462400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3462500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3462600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0247, - "step": 3462700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3462800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3462900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0245, - "step": 3463000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3463100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3463200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3463300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0243, - "step": 3463400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3463500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3463600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3463700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3463800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0268, - "step": 3463900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3464000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3464100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3464200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3464300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0271, - "step": 3464400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3464500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3464600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3464700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3464800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3464900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3465000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0243, - "step": 3465100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3465200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3465300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3465400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3465500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3465600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3465700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3465800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3465900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3466000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3466100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3466200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3466300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0268, - "step": 3466400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3466500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3466600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3466700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3466800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3466900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3467000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3467100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3467200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3467300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0245, - "step": 3467400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3467500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3467600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3467700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0246, - "step": 3467800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0272, - "step": 3467900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3468000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3468100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3468200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3468300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0246, - "step": 3468400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3468500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3468600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3468700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3468800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3468900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3469000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3469100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3469200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3469300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3469400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3469500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3469600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3469700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3469800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3469900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0271, - "step": 3470000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3470100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0245, - "step": 3470200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3470300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3470400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3470500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3470600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3470700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0269, - "step": 3470800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3470900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3471000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3471100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3471200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3471300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3471400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3471500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3471600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3471700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0273, - "step": 3471800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3471900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3472000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3472100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3472200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3472300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3472400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0247, - "step": 3472500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3472600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0247, - "step": 3472700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3472800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3472900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3473000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3473100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0246, - "step": 3473200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3473300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3473400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3473500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3473600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3473700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0247, - "step": 3473800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3473900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3474000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3474100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3474200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3474300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0247, - "step": 3474400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3474500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3474600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3474700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3474800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3474900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3475000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3475100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3475200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3475300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3475400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3475500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3475600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3475700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3475800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3475900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3476000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3476100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3476200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0242, - "step": 3476300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3476400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3476500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3476600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3476700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3476800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3476900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3477000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3477100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3477200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3477300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3477400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0244, - "step": 3477500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3477600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3477700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3477800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3477900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3478000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3478100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3478200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3478300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3478400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3478500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.027, - "step": 3478600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3478700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3478800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3478900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3479000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3479100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3479200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3479300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3479400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3479500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0269, - "step": 3479600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3479700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3479800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3479900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3480000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0251617431640625, - "eval_runtime": 3063.2627, - "eval_samples_per_second": 367.165, - "eval_steps_per_second": 22.948, - "step": 3480000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3480100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3480200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3480300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3480400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3480500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3480600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3480700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3480800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3480900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3481000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0244, - "step": 3481100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3481200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3481300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3481400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3481500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3481600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3481700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3481800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3481900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3482000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3482100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0243, - "step": 3482200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3482300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3482400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3482500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3482600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3482700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3482800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3482900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3483000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0246, - "step": 3483100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3483200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0245, - "step": 3483300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3483400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3483500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3483600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3483700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3483800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3483900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3484000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0269, - "step": 3484100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3484200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3484300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3484400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3484500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3484600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3484700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3484800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3484900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3485000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3485100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3485200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3485300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3485400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3485500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3485600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3485700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0245, - "step": 3485800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3485900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3486000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0247, - "step": 3486100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3486200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3486300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3486400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3486500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3486600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3486700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3486800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3486900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3487000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3487100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3487200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3487300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3487400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3487500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3487600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3487700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3487800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3487900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3488000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3488100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3488200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3488300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0269, - "step": 3488400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3488500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3488600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3488700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3488800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0269, - "step": 3488900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3489000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3489100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3489200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3489300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3489400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3489500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3489600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3489700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3489800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3489900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3490000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0247, - "step": 3490100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3490200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3490300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3490400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3490500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3490600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3490700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3490800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3490900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3491000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0247, - "step": 3491100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3491200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3491300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3491400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3491500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3491600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3491700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3491800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3491900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3492000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3492100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3492200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3492300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3492400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0247, - "step": 3492500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3492600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3492700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3492800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3492900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3493000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3493100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3493200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3493300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3493400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3493500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3493600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3493700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3493800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3493900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3494000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3494100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3494200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3494300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3494400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3494500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3494600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3494700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3494800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3494900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3495000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3495100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3495200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3495300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3495400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3495500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3495600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3495700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3495800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3495900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3496000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3496100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3496200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0247, - "step": 3496300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3496400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3496500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3496600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3496700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3496800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3496900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3497000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3497100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3497200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3497300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3497400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0246, - "step": 3497500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0246, - "step": 3497600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3497700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3497800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3497900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3498000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0246, - "step": 3498100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3498200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3498300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3498400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3498500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3498600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3498700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3498800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3498900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3499000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0247, - "step": 3499100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3499200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3499300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3499400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3499500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3499600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3499700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3499800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3499900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3500000 - }, - { - "epoch": 0.0, - "eval_loss": 0.025115966796875, - "eval_runtime": 4339.9133, - "eval_samples_per_second": 259.158, - "eval_steps_per_second": 16.198, - "step": 3500000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3500100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3500200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3500300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3500400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3500500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0239, - "step": 3500600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3500700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3500800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3500900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3501000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3501100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3501200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3501300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3501400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3501500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3501600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3501700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3501800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3501900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3502000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3502100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3502200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3502300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3502400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3502500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3502600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3502700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3502800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3502900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3503000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3503100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3503200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3503300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3503400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3503500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3503600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3503700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3503800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3503900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3504000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3504100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3504200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0271, - "step": 3504300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3504400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3504500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3504600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3504700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0243, - "step": 3504800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0247, - "step": 3504900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3505000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0244, - "step": 3505100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3505200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0246, - "step": 3505300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3505400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3505500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3505600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3505700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3505800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3505900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3506000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0246, - "step": 3506100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3506200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3506300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3506400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3506500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.024, - "step": 3506600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3506700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3506800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3506900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3507000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3507100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3507200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3507300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3507400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0268, - "step": 3507500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3507600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3507700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3507800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3507900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3508000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3508100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3508200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3508300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3508400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3508500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3508600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0243, - "step": 3508700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3508800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3508900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3509000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3509100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3509200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3509300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3509400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3509500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3509600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3509700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3509800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3509900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3510000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3510100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0245, - "step": 3510200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3510300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3510400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3510500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3510600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3510700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3510800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0247, - "step": 3510900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3511000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3511100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3511200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3511300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3511400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3511500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3511600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3511700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3511800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3511900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3512000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3512100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3512200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3512300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3512400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3512500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3512600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3512700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3512800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0245, - "step": 3512900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0244, - "step": 3513000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3513100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3513200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3513300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0243, - "step": 3513400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3513500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3513600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3513700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3513800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3513900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3514000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3514100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3514200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3514300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3514400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3514500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3514600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3514700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3514800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3514900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3515000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3515100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3515200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3515300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3515400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3515500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0271, - "step": 3515600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3515700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3515800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3515900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0269, - "step": 3516000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3516100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3516200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3516300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0268, - "step": 3516400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3516500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0239, - "step": 3516600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3516700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3516800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3516900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3517000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3517100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0244, - "step": 3517200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3517300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3517400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3517500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3517600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3517700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3517800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3517900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3518000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3518100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3518200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3518300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3518400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0247, - "step": 3518500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3518600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3518700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0247, - "step": 3518800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3518900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3519000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3519100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3519200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3519300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3519400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3519500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0238, - "step": 3519600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3519700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3519800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3519900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3520000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0251007080078125, - "eval_runtime": 4332.2578, - "eval_samples_per_second": 259.616, - "eval_steps_per_second": 16.226, - "step": 3520000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3520100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0268, - "step": 3520200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3520300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3520400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3520500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3520600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3520700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3520800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3520900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3521000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3521100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3521200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3521300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3521400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3521500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3521600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3521700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3521800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3521900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3522000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3522100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3522200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3522300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3522400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3522500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3522600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3522700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3522800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3522900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3523000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3523100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0269, - "step": 3523200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3523300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3523400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3523500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3523600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3523700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3523800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3523900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3524000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3524100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0243, - "step": 3524200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3524300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0247, - "step": 3524400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3524500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3524600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3524700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3524800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3524900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3525000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3525100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3525200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3525300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0245, - "step": 3525400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0271, - "step": 3525500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3525600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3525700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0247, - "step": 3525800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3525900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3526000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3526100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3526200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3526300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3526400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3526500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3526600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3526700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3526800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3526900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3527000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3527100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3527200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3527300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3527400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3527500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3527600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3527700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3527800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0244, - "step": 3527900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3528000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3528100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0242, - "step": 3528200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3528300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3528400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3528500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3528600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3528700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3528800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0244, - "step": 3528900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3529000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3529100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3529200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3529300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3529400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0247, - "step": 3529500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3529600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3529700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3529800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3529900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3530000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3530100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0247, - "step": 3530200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3530300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3530400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3530500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3530600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3530700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3530800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3530900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3531000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0247, - "step": 3531100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3531200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3531300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3531400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0243, - "step": 3531500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3531600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3531700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3531800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3531900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3532000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3532100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.027, - "step": 3532200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3532300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3532400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3532500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3532600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3532700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0247, - "step": 3532800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3532900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3533000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3533100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3533200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3533300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3533400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3533500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3533600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3533700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3533800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3533900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3534000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3534100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3534200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3534300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3534400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3534500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0247, - "step": 3534600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3534700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3534800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3534900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3535000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3535100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3535200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3535300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3535400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3535500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3535600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3535700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3535800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3535900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3536000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.027, - "step": 3536100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3536200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3536300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3536400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0243, - "step": 3536500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3536600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3536700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3536800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3536900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3537000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3537100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3537200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3537300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3537400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3537500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3537600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3537700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3537800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3537900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3538000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3538100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3538200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3538300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3538400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3538500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3538600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3538700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3538800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3538900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3539000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0268, - "step": 3539100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0245, - "step": 3539200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3539300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3539400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3539500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3539600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3539700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3539800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3539900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3540000 - }, - { - "epoch": 0.0, - "eval_loss": 0.02508544921875, - "eval_runtime": 4211.7411, - "eval_samples_per_second": 267.045, - "eval_steps_per_second": 16.69, - "step": 3540000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0246, - "step": 3540100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3540200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.024, - "step": 3540300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3540400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0247, - "step": 3540500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3540600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3540700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3540800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3540900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3541000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3541100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3541200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3541300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0246, - "step": 3541400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3541500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3541600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3541700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3541800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3541900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3542000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3542100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0245, - "step": 3542200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3542300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3542400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3542500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3542600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3542700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3542800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3542900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0245, - "step": 3543000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0242, - "step": 3543100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3543200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0241, - "step": 3543300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3543400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3543500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3543600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3543700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3543800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0269, - "step": 3543900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3544000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0244, - "step": 3544100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3544200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3544300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3544400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0242, - "step": 3544500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3544600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3544700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3544800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3544900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3545000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0269, - "step": 3545100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3545200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0246, - "step": 3545300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3545400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3545500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3545600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3545700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3545800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3545900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3546000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3546100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0246, - "step": 3546200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3546300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3546400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3546500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3546600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3546700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3546800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3546900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3547000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3547100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3547200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3547300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0247, - "step": 3547400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3547500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0247, - "step": 3547600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0272, - "step": 3547700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3547800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3547900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3548000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3548100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3548200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3548300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3548400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3548500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3548600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0246, - "step": 3548700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3548800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3548900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3549000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3549100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0238, - "step": 3549200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3549300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3549400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3549500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3549600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3549700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0245, - "step": 3549800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3549900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3550000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3550100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3550200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3550300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3550400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3550500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3550600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3550700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0269, - "step": 3550800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3550900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0247, - "step": 3551000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3551100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3551200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3551300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3551400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0245, - "step": 3551500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3551600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3551700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3551800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.027, - "step": 3551900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3552000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3552100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3552200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3552300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3552400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3552500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3552600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3552700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3552800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3552900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3553000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3553100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3553200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3553300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0246, - "step": 3553400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3553500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3553600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3553700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3553800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3553900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3554000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3554100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0246, - "step": 3554200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3554300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3554400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3554500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3554600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3554700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3554800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3554900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3555000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3555100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3555200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3555300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3555400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3555500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3555600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3555700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3555800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3555900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3556000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3556100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3556200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3556300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3556400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3556500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3556600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3556700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3556800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3556900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3557000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3557100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3557200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3557300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3557400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3557500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3557600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0245, - "step": 3557700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3557800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3557900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3558000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0245, - "step": 3558100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3558200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3558300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3558400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0247, - "step": 3558500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3558600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0245, - "step": 3558700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3558800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3558900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3559000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3559100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3559200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3559300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3559400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3559500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0245, - "step": 3559600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3559700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3559800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0245, - "step": 3559900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3560000 - }, - { - "epoch": 0.0, - "eval_loss": 0.025054931640625, - "eval_runtime": 3881.6639, - "eval_samples_per_second": 289.753, - "eval_steps_per_second": 18.11, - "step": 3560000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3560100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3560200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3560300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3560400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3560500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3560600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3560700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3560800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3560900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3561000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3561100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3561200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0239, - "step": 3561300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3561400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3561500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3561600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3561700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3561800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3561900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3562000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3562100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0269, - "step": 3562200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3562300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3562400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3562500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3562600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3562700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3562800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3562900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3563000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3563100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3563200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3563300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3563400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3563500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3563600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3563700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3563800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3563900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3564000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0247, - "step": 3564100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3564200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3564300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3564400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3564500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0243, - "step": 3564600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3564700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3564800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3564900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3565000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3565100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3565200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3565300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0247, - "step": 3565400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0268, - "step": 3565500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3565600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3565700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3565800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3565900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0247, - "step": 3566000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3566100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3566200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3566300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0247, - "step": 3566400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3566500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3566600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3566700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3566800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3566900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3567000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3567100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3567200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3567300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3567400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0268, - "step": 3567500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3567600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3567700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3567800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3567900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3568000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3568100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0242, - "step": 3568200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3568300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3568400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3568500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3568600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0246, - "step": 3568700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3568800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3568900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3569000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3569100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3569200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3569300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3569400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3569500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3569600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3569700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3569800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3569900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3570000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3570100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0269, - "step": 3570200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3570300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3570400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3570500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0246, - "step": 3570600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3570700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3570800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3570900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3571000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3571100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3571200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0246, - "step": 3571300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3571400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3571500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3571600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3571700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3571800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3571900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0272, - "step": 3572000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3572100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3572200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3572300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3572400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3572500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3572600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3572700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3572800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3572900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3573000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3573100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3573200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0245, - "step": 3573300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3573400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0247, - "step": 3573500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3573600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3573700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3573800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3573900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3574000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3574100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0246, - "step": 3574200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0246, - "step": 3574300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3574400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3574500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3574600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0247, - "step": 3574700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3574800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3574900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3575000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0271, - "step": 3575100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3575200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3575300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3575400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3575500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3575600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3575700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0243, - "step": 3575800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3575900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3576000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3576100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3576200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3576300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3576400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3576500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3576600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3576700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3576800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3576900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3577000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3577100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0246, - "step": 3577200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3577300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0245, - "step": 3577400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3577500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3577600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3577700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3577800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0246, - "step": 3577900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3578000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3578100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3578200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0245, - "step": 3578300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3578400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3578500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3578600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3578700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3578800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3578900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3579000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3579100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3579200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3579300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3579400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3579500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3579600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3579700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3579800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3579900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3580000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0250244140625, - "eval_runtime": 3782.3415, - "eval_samples_per_second": 297.362, - "eval_steps_per_second": 18.585, - "step": 3580000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3580100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3580200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3580300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0245, - "step": 3580400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3580500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3580600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3580700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0274, - "step": 3580800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3580900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3581000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3581100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3581200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3581300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3581400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.024, - "step": 3581500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3581600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3581700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3581800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3581900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3582000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0246, - "step": 3582100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3582200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3582300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3582400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0243, - "step": 3582500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0246, - "step": 3582600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3582700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3582800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3582900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3583000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3583100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3583200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3583300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0245, - "step": 3583400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3583500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3583600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3583700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0247, - "step": 3583800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3583900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3584000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3584100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3584200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3584300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3584400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3584500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3584600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3584700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3584800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0242, - "step": 3584900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3585000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3585100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3585200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3585300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3585400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3585500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0268, - "step": 3585600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3585700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0232, - "step": 3585800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3585900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3586000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3586100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0247, - "step": 3586200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3586300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3586400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3586500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.027, - "step": 3586600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3586700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0242, - "step": 3586800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3586900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3587000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0275, - "step": 3587100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0284, - "step": 3587200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3587300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0245, - "step": 3587400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3587500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3587600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3587700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3587800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3587900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3588000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3588100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3588200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3588300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3588400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3588500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3588600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3588700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3588800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3588900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3589000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3589100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3589200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3589300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0244, - "step": 3589400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3589500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3589600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3589700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3589800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0244, - "step": 3589900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3590000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3590100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3590200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3590300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3590400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3590500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3590600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3590700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3590800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3590900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3591000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3591100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3591200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0247, - "step": 3591300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3591400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3591500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3591600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3591700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3591800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3591900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3592000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3592100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3592200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3592300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0242, - "step": 3592400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3592500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3592600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0244, - "step": 3592700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3592800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3592900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0246, - "step": 3593000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3593100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3593200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3593300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3593400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3593500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3593600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3593700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3593800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3593900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3594000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3594100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3594200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0247, - "step": 3594300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3594400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3594500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0271, - "step": 3594600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3594700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3594800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0245, - "step": 3594900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3595000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0237, - "step": 3595100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3595200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3595300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3595400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3595500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0247, - "step": 3595600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3595700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3595800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3595900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3596000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.027, - "step": 3596100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3596200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3596300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3596400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3596500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3596600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3596700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3596800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3596900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3597000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3597100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3597200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3597300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3597400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3597500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3597600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3597700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3597800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3597900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3598000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3598100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3598200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3598300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3598400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0247, - "step": 3598500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3598600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0241, - "step": 3598700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3598800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3598900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3599000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3599100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3599200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3599300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3599400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3599500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0247, - "step": 3599600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3599700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3599800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3599900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0247, - "step": 3600000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0250396728515625, - "eval_runtime": 3978.3043, - "eval_samples_per_second": 282.714, - "eval_steps_per_second": 17.67, - "step": 3600000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0246, - "step": 3600100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3600200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3600300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3600400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3600500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3600600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0247, - "step": 3600700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3600800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0246, - "step": 3600900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3601000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3601100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0246, - "step": 3601200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3601300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3601400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3601500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3601600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3601700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3601800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3601900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0242, - "step": 3602000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3602100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3602200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3602300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3602400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3602500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3602600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3602700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3602800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3602900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3603000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0245, - "step": 3603100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3603200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3603300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3603400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3603500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3603600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3603700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3603800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0246, - "step": 3603900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3604000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3604100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3604200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3604300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3604400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0243, - "step": 3604500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3604600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3604700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3604800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3604900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3605000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3605100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3605200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3605300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3605400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3605500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0246, - "step": 3605600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3605700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0245, - "step": 3605800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3605900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3606000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0245, - "step": 3606100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3606200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3606300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3606400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3606500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3606600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3606700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3606800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3606900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0243, - "step": 3607000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3607100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3607200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3607300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3607400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3607500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3607600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3607700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3607800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3607900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3608000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0246, - "step": 3608100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3608200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3608300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0247, - "step": 3608400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3608500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3608600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3608700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3608800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3608900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3609000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3609100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3609200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3609300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3609400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3609500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3609600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3609700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3609800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3609900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3610000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3610100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0268, - "step": 3610200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3610300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3610400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3610500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3610600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3610700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3610800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3610900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3611000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3611100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3611200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3611300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3611400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3611500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3611600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3611700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3611800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3611900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0243, - "step": 3612000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3612100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3612200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3612300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3612400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3612500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3612600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3612700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0269, - "step": 3612800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3612900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3613000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0244, - "step": 3613100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3613200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3613300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3613400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3613500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3613600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3613700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3613800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3613900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3614000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0245, - "step": 3614100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3614200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3614300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3614400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3614500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3614600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3614700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3614800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3614900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3615000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3615100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3615200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3615300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3615400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0247, - "step": 3615500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3615600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3615700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3615800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3615900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3616000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3616100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3616200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3616300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0244, - "step": 3616400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0245, - "step": 3616500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3616600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0245, - "step": 3616700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3616800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3616900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3617000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3617100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3617200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3617300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3617400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0241, - "step": 3617500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3617600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3617700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3617800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3617900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3618000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3618100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3618200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3618300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3618400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3618500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3618600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3618700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3618800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3618900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3619000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3619100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3619200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3619300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3619400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3619500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3619600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3619700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3619800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3619900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0242, - "step": 3620000 - }, - { - "epoch": 0.0, - "eval_loss": 0.025054931640625, - "eval_runtime": 4907.8924, - "eval_samples_per_second": 229.166, - "eval_steps_per_second": 14.323, - "step": 3620000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0243, - "step": 3620100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3620200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3620300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3620400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0245, - "step": 3620500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3620600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3620700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3620800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3620900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3621000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3621100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3621200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0247, - "step": 3621300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3621400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3621500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3621600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3621700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3621800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3621900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3622000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0268, - "step": 3622100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3622200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3622300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3622400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0243, - "step": 3622500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3622600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3622700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3622800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3622900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3623000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3623100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3623200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3623300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0247, - "step": 3623400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3623500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3623600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0247, - "step": 3623700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3623800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3623900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3624000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3624100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3624200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3624300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0247, - "step": 3624400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3624500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3624600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3624700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0247, - "step": 3624800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.027, - "step": 3624900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0247, - "step": 3625000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3625100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3625200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3625300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3625400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3625500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3625600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3625700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3625800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3625900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3626000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3626100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3626200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3626300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0246, - "step": 3626400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3626500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0242, - "step": 3626600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3626700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3626800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3626900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3627000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3627100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0243, - "step": 3627200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3627300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3627400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3627500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3627600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0246, - "step": 3627700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3627800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3627900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3628000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3628100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0245, - "step": 3628200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3628300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3628400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3628500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0262, - "step": 3628600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3628700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3628800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0243, - "step": 3628900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3629000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0239, - "step": 3629100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3629200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3629300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0245, - "step": 3629400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0245, - "step": 3629500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3629600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3629700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3629800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0246, - "step": 3629900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3630000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0264, - "step": 3630100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0259, - "step": 3630200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3630300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0247, - "step": 3630400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3630500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3630600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3630700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3630800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3630900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3631000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3631100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0247, - "step": 3631200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3631300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3631400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3631500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3631600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3631700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0263, - "step": 3631800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3631900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0245, - "step": 3632000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3632100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0266, - "step": 3632200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3632300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3632400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3632500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3632600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3632700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3632800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3632900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3633000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3633100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3633200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3633300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0243, - "step": 3633400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3633500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3633600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3633700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3633800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3633900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3634000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0245, - "step": 3634100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3634200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3634300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3634400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3634500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3634600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3634700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3634800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3634900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3635000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3635100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0244, - "step": 3635200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3635300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3635400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0253, - "step": 3635500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3635600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3635700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3635800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3635900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3636000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3636100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0246, - "step": 3636200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0247, - "step": 3636300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3636400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3636500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0246, - "step": 3636600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3636700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3636800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3636900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3637000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3637100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0246, - "step": 3637200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0258, - "step": 3637300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.026, - "step": 3637400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3637500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0265, - "step": 3637600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0252, - "step": 3637700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0243, - "step": 3637800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0247, - "step": 3637900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3638000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0261, - "step": 3638100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3638200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0256, - "step": 3638300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3638400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3638500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0248, - "step": 3638600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0254, - "step": 3638700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.025, - "step": 3638800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3638900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0255, - "step": 3639000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3639100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0243, - "step": 3639200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0257, - "step": 3639300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0246, - "step": 3639400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0245, - "step": 3639500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3639600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0251, - "step": 3639700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0241, - "step": 3639800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0249, - "step": 3639900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-05, - "loss": 0.0267, - "step": 3640000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0250091552734375, - "eval_runtime": 5536.8339, - "eval_samples_per_second": 203.135, - "eval_steps_per_second": 12.696, - "step": 3640000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3640100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3640200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0243, - "step": 3640300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0257, - "step": 3640400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0259, - "step": 3640500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3640600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0241, - "step": 3640700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0264, - "step": 3640800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3640900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3641000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0243, - "step": 3641100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3641200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3641300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3641400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3641500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0269, - "step": 3641600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3641700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3641800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3641900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3642000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3642100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3642200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3642300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0258, - "step": 3642400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3642500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0258, - "step": 3642600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0275, - "step": 3642700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3642800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3642900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3643000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0263, - "step": 3643100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3643200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3643300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0245, - "step": 3643400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0258, - "step": 3643500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3643600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0258, - "step": 3643700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3643800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0263, - "step": 3643900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0257, - "step": 3644000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3644100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0267, - "step": 3644200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0243, - "step": 3644300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3644400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3644500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3644600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3644700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3644800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3644900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3645000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3645100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0259, - "step": 3645200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3645300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3645400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3645500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0257, - "step": 3645600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3645700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0261, - "step": 3645800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0257, - "step": 3645900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3646000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.026, - "step": 3646100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3646200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3646300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3646400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3646500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3646600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3646700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3646800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3646900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3647000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3647100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0244, - "step": 3647200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3647300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3647400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0264, - "step": 3647500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3647600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3647700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0263, - "step": 3647800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3647900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3648000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3648100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3648200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0258, - "step": 3648300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0259, - "step": 3648400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3648500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3648600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0257, - "step": 3648700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3648800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3648900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3649000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3649100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0261, - "step": 3649200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3649300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3649400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0269, - "step": 3649500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0257, - "step": 3649600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3649700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3649800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3649900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3650000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3650100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0258, - "step": 3650200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3650300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3650400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3650500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0243, - "step": 3650600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0259, - "step": 3650700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3650800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3650900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3651000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3651100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3651200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3651300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3651400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3651500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3651600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0257, - "step": 3651700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3651800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3651900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.026, - "step": 3652000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3652100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3652200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3652300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0245, - "step": 3652400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3652500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3652600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3652700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3652800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3652900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3653000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3653100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0245, - "step": 3653200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3653300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3653400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3653500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3653600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0243, - "step": 3653700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3653800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3653900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3654000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3654100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0261, - "step": 3654200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0244, - "step": 3654300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0243, - "step": 3654400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3654500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3654600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0263, - "step": 3654700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.026, - "step": 3654800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3654900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0266, - "step": 3655000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3655100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0264, - "step": 3655200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3655300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3655400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3655500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3655600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3655700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3655800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.027, - "step": 3655900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3656000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0259, - "step": 3656100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3656200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0242, - "step": 3656300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3656400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3656500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3656600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0259, - "step": 3656700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0259, - "step": 3656800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3656900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3657000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0261, - "step": 3657100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3657200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3657300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3657400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3657500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3657600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3657700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3657800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3657900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3658000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0242, - "step": 3658100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3658200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3658300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3658400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0236, - "step": 3658500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3658600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.026, - "step": 3658700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3658800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3658900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0261, - "step": 3659000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3659100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0259, - "step": 3659200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0243, - "step": 3659300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3659400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0259, - "step": 3659500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3659600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3659700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3659800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3659900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3660000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0248870849609375, - "eval_runtime": 140.1439, - "eval_samples_per_second": 356.776, - "eval_steps_per_second": 22.299, - "step": 3660000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3660100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3660200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3660300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0257, - "step": 3660400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0257, - "step": 3660500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3660600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3660700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0243, - "step": 3660800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3660900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3661000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0259, - "step": 3661100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3661200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0257, - "step": 3661300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3661400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0245, - "step": 3661500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0243, - "step": 3661600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3661700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0242, - "step": 3661800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0258, - "step": 3661900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3662000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0264, - "step": 3662100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3662200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0257, - "step": 3662300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3662400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3662500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3662600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3662700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3662800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3662900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0242, - "step": 3663000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0269, - "step": 3663100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3663200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3663300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.026, - "step": 3663400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3663500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3663600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3663700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3663800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0258, - "step": 3663900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3664000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3664100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3664200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0259, - "step": 3664300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3664400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.026, - "step": 3664500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3664600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3664700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0244, - "step": 3664800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0266, - "step": 3664900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3665000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3665100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3665200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0267, - "step": 3665300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3665400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3665500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3665600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3665700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0257, - "step": 3665800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3665900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3666000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0245, - "step": 3666100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0239, - "step": 3666200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0243, - "step": 3666300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3666400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0263, - "step": 3666500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0235, - "step": 3666600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0263, - "step": 3666700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3666800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0259, - "step": 3666900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0258, - "step": 3667000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3667100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3667200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0257, - "step": 3667300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3667400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0243, - "step": 3667500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0263, - "step": 3667600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0258, - "step": 3667700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3667800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3667900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0261, - "step": 3668000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3668100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0259, - "step": 3668200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3668300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.026, - "step": 3668400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3668500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3668600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3668700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3668800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.026, - "step": 3668900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3669000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0244, - "step": 3669100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3669200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0261, - "step": 3669300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3669400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3669500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0258, - "step": 3669600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3669700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3669800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0245, - "step": 3669900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0271, - "step": 3670000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3670100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3670200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0277, - "step": 3670300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0258, - "step": 3670400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3670500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3670600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0263, - "step": 3670700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0264, - "step": 3670800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0241, - "step": 3670900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3671000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3671100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3671200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3671300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3671400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3671500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3671600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3671700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3671800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3671900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0257, - "step": 3672000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3672100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0241, - "step": 3672200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3672300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0264, - "step": 3672400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0258, - "step": 3672500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3672600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3672700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0245, - "step": 3672800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0259, - "step": 3672900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0239, - "step": 3673000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3673100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0245, - "step": 3673200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3673300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0258, - "step": 3673400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0258, - "step": 3673500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3673600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0259, - "step": 3673700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3673800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0236, - "step": 3673900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3674000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.026, - "step": 3674100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3674200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0244, - "step": 3674300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3674400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0265, - "step": 3674500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3674600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0266, - "step": 3674700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3674800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3674900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3675000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3675100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3675200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3675300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3675400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3675500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.027, - "step": 3675600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3675700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0263, - "step": 3675800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3675900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3676000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3676100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3676200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3676300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3676400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3676500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0257, - "step": 3676600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3676700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0257, - "step": 3676800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3676900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3677000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0242, - "step": 3677100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0258, - "step": 3677200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3677300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3677400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3677500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3677600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3677700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3677800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3677900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3678000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3678100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3678200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3678300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3678400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0261, - "step": 3678500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3678600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3678700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3678800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3678900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3679000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3679100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3679200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0258, - "step": 3679300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3679400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0257, - "step": 3679500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0259, - "step": 3679600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0245, - "step": 3679700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3679800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3679900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3680000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0248565673828125, - "eval_runtime": 139.6097, - "eval_samples_per_second": 358.141, - "eval_steps_per_second": 22.384, - "step": 3680000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3680100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3680200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3680300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3680400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3680500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3680600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3680700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3680800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3680900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0243, - "step": 3681000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3681100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3681200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0245, - "step": 3681300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.026, - "step": 3681400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3681500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3681600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3681700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0264, - "step": 3681800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.026, - "step": 3681900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0244, - "step": 3682000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0257, - "step": 3682100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3682200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.024, - "step": 3682300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3682400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3682500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3682600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0266, - "step": 3682700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0245, - "step": 3682800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0244, - "step": 3682900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3683000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3683100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.026, - "step": 3683200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3683300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3683400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3683500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3683600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0245, - "step": 3683700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3683800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3683900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3684000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0242, - "step": 3684100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0243, - "step": 3684200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3684300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3684400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3684500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3684600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3684700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0257, - "step": 3684800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0258, - "step": 3684900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0243, - "step": 3685000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3685100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.024, - "step": 3685200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0268, - "step": 3685300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3685400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3685500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3685600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0264, - "step": 3685700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3685800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.026, - "step": 3685900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3686000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3686100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0257, - "step": 3686200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0258, - "step": 3686300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0258, - "step": 3686400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0257, - "step": 3686500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3686600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3686700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3686800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3686900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3687000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3687100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3687200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3687300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0258, - "step": 3687400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3687500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3687600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0258, - "step": 3687700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3687800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3687900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3688000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3688100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3688200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0258, - "step": 3688300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0263, - "step": 3688400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3688500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0245, - "step": 3688600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0257, - "step": 3688700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0241, - "step": 3688800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3688900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0261, - "step": 3689000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0257, - "step": 3689100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3689200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0263, - "step": 3689300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3689400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0245, - "step": 3689500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3689600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3689700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3689800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3689900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3690000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3690100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3690200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.027, - "step": 3690300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0257, - "step": 3690400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3690500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3690600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3690700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0264, - "step": 3690800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3690900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0242, - "step": 3691000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3691100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3691200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3691300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3691400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3691500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3691600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3691700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0258, - "step": 3691800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0266, - "step": 3691900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3692000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3692100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3692200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0257, - "step": 3692300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3692400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3692500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0258, - "step": 3692600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3692700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3692800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0266, - "step": 3692900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3693000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3693100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3693200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3693300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0245, - "step": 3693400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0263, - "step": 3693500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3693600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0257, - "step": 3693700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0265, - "step": 3693800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3693900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3694000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3694100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0243, - "step": 3694200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3694300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3694400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0258, - "step": 3694500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3694600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3694700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0241, - "step": 3694800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3694900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3695000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3695100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3695200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3695300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3695400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3695500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3695600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0257, - "step": 3695700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3695800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3695900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3696000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3696100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3696200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3696300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3696400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0257, - "step": 3696500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0245, - "step": 3696600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3696700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0241, - "step": 3696800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0257, - "step": 3696900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0243, - "step": 3697000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0243, - "step": 3697100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3697200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0258, - "step": 3697300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3697400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3697500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0258, - "step": 3697600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3697700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3697800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3697900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3698000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0245, - "step": 3698100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0261, - "step": 3698200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3698300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3698400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.026, - "step": 3698500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3698600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0241, - "step": 3698700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3698800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3698900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3699000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3699100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3699200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3699300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3699400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.024, - "step": 3699500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0268, - "step": 3699600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0257, - "step": 3699700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.027, - "step": 3699800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3699900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0257, - "step": 3700000 - }, - { - "epoch": 0.0, - "eval_loss": 0.02484130859375, - "eval_runtime": 131.8971, - "eval_samples_per_second": 379.083, - "eval_steps_per_second": 23.693, - "step": 3700000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3700100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3700200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3700300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3700400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3700500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3700600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0245, - "step": 3700700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3700800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3700900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0258, - "step": 3701000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3701100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3701200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3701300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3701400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3701500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3701600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3701700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3701800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3701900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.024, - "step": 3702000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3702100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3702200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3702300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3702400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.026, - "step": 3702500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3702600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0266, - "step": 3702700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0244, - "step": 3702800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3702900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3703000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3703100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0258, - "step": 3703200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0262, - "step": 3703300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3703400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3703500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3703600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3703700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0261, - "step": 3703800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0258, - "step": 3703900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0239, - "step": 3704000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0244, - "step": 3704100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3704200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3704300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3704400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3704500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3704600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3704700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0269, - "step": 3704800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3704900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0261, - "step": 3705000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3705100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0241, - "step": 3705200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0265, - "step": 3705300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3705400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0244, - "step": 3705500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3705600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3705700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3705800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3705900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0257, - "step": 3706000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3706100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3706200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0244, - "step": 3706300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0257, - "step": 3706400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3706500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0244, - "step": 3706600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3706700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0242, - "step": 3706800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.024, - "step": 3706900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3707000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0241, - "step": 3707100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3707200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3707300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3707400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3707500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.024, - "step": 3707600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0262, - "step": 3707700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3707800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3707900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3708000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3708100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0266, - "step": 3708200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3708300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3708400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3708500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3708600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0243, - "step": 3708700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3708800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3708900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0245, - "step": 3709000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3709100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3709200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3709300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0263, - "step": 3709400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3709500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3709600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0265, - "step": 3709700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3709800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0244, - "step": 3709900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0258, - "step": 3710000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3710100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3710200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3710300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3710400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0259, - "step": 3710500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3710600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0241, - "step": 3710700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0259, - "step": 3710800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3710900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3711000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3711100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3711200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3711300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3711400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0258, - "step": 3711500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3711600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0245, - "step": 3711700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3711800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0259, - "step": 3711900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3712000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0261, - "step": 3712100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3712200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3712300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3712400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0245, - "step": 3712500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0244, - "step": 3712600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3712700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3712800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3712900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0241, - "step": 3713000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3713100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3713200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3713300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3713400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3713500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3713600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3713700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0258, - "step": 3713800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0257, - "step": 3713900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3714000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3714100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3714200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0244, - "step": 3714300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0257, - "step": 3714400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3714500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0257, - "step": 3714600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3714700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3714800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3714900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3715000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0259, - "step": 3715100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0261, - "step": 3715200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3715300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0257, - "step": 3715400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3715500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0242, - "step": 3715600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3715700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0265, - "step": 3715800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3715900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0245, - "step": 3716000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3716100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3716200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3716300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3716400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3716500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3716600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0257, - "step": 3716700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3716800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0243, - "step": 3716900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3717000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3717100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.026, - "step": 3717200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.026, - "step": 3717300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3717400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3717500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0244, - "step": 3717600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0259, - "step": 3717700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3717800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3717900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3718000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3718100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3718200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.026, - "step": 3718300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3718400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0241, - "step": 3718500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0258, - "step": 3718600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0259, - "step": 3718700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3718800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0242, - "step": 3718900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0267, - "step": 3719000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3719100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0262, - "step": 3719200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3719300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3719400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3719500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3719600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0259, - "step": 3719700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3719800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.026, - "step": 3719900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0257, - "step": 3720000 - }, - { - "epoch": 0.0, - "eval_loss": 0.02484130859375, - "eval_runtime": 132.2399, - "eval_samples_per_second": 378.101, - "eval_steps_per_second": 23.631, - "step": 3720000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.027, - "step": 3720100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0258, - "step": 3720200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3720300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3720400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0259, - "step": 3720500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3720600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3720700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.026, - "step": 3720800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.024, - "step": 3720900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3721000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0257, - "step": 3721100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3721200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3721300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3721400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3721500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3721600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3721700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3721800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3721900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3722000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3722100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3722200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3722300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0244, - "step": 3722400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0259, - "step": 3722500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0258, - "step": 3722600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3722700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0259, - "step": 3722800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0245, - "step": 3722900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3723000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3723100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3723200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3723300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3723400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3723500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3723600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3723700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0243, - "step": 3723800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3723900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3724000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0267, - "step": 3724100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3724200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3724300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3724400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3724500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0229, - "step": 3724600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0261, - "step": 3724700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3724800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3724900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0245, - "step": 3725000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3725100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0258, - "step": 3725200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0242, - "step": 3725300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0257, - "step": 3725400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3725500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3725600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3725700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0235, - "step": 3725800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0259, - "step": 3725900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0263, - "step": 3726000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0244, - "step": 3726100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3726200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3726300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3726400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3726500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3726600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0241, - "step": 3726700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3726800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0245, - "step": 3726900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3727000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3727100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0245, - "step": 3727200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0245, - "step": 3727300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0263, - "step": 3727400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3727500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3727600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0257, - "step": 3727700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0257, - "step": 3727800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0243, - "step": 3727900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3728000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.026, - "step": 3728100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3728200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3728300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0244, - "step": 3728400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3728500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0261, - "step": 3728600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3728700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3728800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0258, - "step": 3728900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0257, - "step": 3729000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0242, - "step": 3729100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0261, - "step": 3729200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3729300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3729400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3729500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0262, - "step": 3729600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3729700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3729800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3729900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3730000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0243, - "step": 3730100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3730200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3730300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3730400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3730500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3730600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0239, - "step": 3730700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0244, - "step": 3730800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3730900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3731000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0238, - "step": 3731100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0241, - "step": 3731200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3731300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3731400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0258, - "step": 3731500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0258, - "step": 3731600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3731700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0245, - "step": 3731800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3731900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0245, - "step": 3732000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3732100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3732200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3732300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0258, - "step": 3732400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3732500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3732600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0245, - "step": 3732700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3732800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3732900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0265, - "step": 3733000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0241, - "step": 3733100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0245, - "step": 3733200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3733300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0261, - "step": 3733400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0262, - "step": 3733500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0257, - "step": 3733600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0245, - "step": 3733700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3733800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0241, - "step": 3733900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3734000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3734100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0244, - "step": 3734200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3734300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0257, - "step": 3734400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3734500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3734600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3734700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3734800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3734900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3735000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3735100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0241, - "step": 3735200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3735300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3735400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0257, - "step": 3735500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3735600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3735700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3735800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0245, - "step": 3735900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3736000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3736100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3736200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3736300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3736400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3736500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3736600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0244, - "step": 3736700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3736800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3736900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0263, - "step": 3737000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3737100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3737200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0259, - "step": 3737300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3737400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3737500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0264, - "step": 3737600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0241, - "step": 3737700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3737800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3737900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3738000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3738100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0261, - "step": 3738200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3738300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.026, - "step": 3738400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3738500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3738600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3738700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3738800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0264, - "step": 3738900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3739000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3739100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3739200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3739300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3739400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0263, - "step": 3739500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0245, - "step": 3739600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0257, - "step": 3739700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.026, - "step": 3739800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3739900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0241, - "step": 3740000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0247955322265625, - "eval_runtime": 134.092, - "eval_samples_per_second": 372.878, - "eval_steps_per_second": 23.305, - "step": 3740000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3740100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3740200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0259, - "step": 3740300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3740400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3740500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3740600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0242, - "step": 3740700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0267, - "step": 3740800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3740900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3741000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0243, - "step": 3741100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0237, - "step": 3741200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0263, - "step": 3741300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0242, - "step": 3741400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3741500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3741600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3741700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3741800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3741900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3742000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3742100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.026, - "step": 3742200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3742300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3742400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3742500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3742600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0244, - "step": 3742700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3742800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3742900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3743000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3743100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3743200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3743300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3743400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0257, - "step": 3743500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0259, - "step": 3743600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0245, - "step": 3743700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3743800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3743900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3744000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3744100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0239, - "step": 3744200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3744300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0245, - "step": 3744400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3744500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3744600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3744700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3744800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3744900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0257, - "step": 3745000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3745100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0239, - "step": 3745200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3745300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3745400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3745500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3745600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3745700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3745800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0244, - "step": 3745900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.026, - "step": 3746000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0238, - "step": 3746100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3746200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3746300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3746400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3746500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0258, - "step": 3746600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0242, - "step": 3746700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0242, - "step": 3746800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0257, - "step": 3746900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3747000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3747100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3747200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3747300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0245, - "step": 3747400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3747500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3747600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3747700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3747800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3747900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0261, - "step": 3748000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3748100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3748200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3748300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3748400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3748500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3748600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3748700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.024, - "step": 3748800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3748900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3749000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3749100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0259, - "step": 3749200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3749300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0243, - "step": 3749400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3749500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3749600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3749700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3749800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3749900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3750000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3750100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3750200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0241, - "step": 3750300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0258, - "step": 3750400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3750500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3750600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3750700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3750800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3750900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3751000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3751100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.024, - "step": 3751200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3751300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3751400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0263, - "step": 3751500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3751600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3751700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3751800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3751900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0267, - "step": 3752000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3752100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3752200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3752300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3752400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0238, - "step": 3752500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3752600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3752700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3752800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3752900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0261, - "step": 3753000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3753100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3753200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3753300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0257, - "step": 3753400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3753500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3753600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3753700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0243, - "step": 3753800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0244, - "step": 3753900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3754000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0244, - "step": 3754100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3754200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3754300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0242, - "step": 3754400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0239, - "step": 3754500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3754600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3754700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3754800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3754900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3755000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3755100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3755200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3755300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3755400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3755500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0244, - "step": 3755600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3755700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.026, - "step": 3755800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3755900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3756000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3756100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0257, - "step": 3756200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0261, - "step": 3756300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3756400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3756500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3756600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3756700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3756800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0242, - "step": 3756900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3757000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0244, - "step": 3757100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3757200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3757300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0241, - "step": 3757400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3757500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3757600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3757700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3757800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3757900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0267, - "step": 3758000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3758100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3758200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3758300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.026, - "step": 3758400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3758500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0245, - "step": 3758600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3758700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0266, - "step": 3758800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3758900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3759000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3759100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0258, - "step": 3759200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3759300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3759400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3759500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3759600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0258, - "step": 3759700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3759800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0261, - "step": 3759900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.024, - "step": 3760000 - }, - { - "epoch": 0.0, - "eval_loss": 0.024810791015625, - "eval_runtime": 133.8117, - "eval_samples_per_second": 373.66, - "eval_steps_per_second": 23.354, - "step": 3760000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3760100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.026, - "step": 3760200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3760300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3760400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3760500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3760600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3760700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0245, - "step": 3760800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0245, - "step": 3760900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3761000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0239, - "step": 3761100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0262, - "step": 3761200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3761300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3761400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3761500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0258, - "step": 3761600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0268, - "step": 3761700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3761800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0257, - "step": 3761900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3762000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0259, - "step": 3762100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0237, - "step": 3762200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0257, - "step": 3762300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3762400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3762500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0261, - "step": 3762600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3762700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3762800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0243, - "step": 3762900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0239, - "step": 3763000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3763100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0265, - "step": 3763200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3763300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3763400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0262, - "step": 3763500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0245, - "step": 3763600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3763700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0259, - "step": 3763800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3763900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0244, - "step": 3764000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3764100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0244, - "step": 3764200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3764300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0257, - "step": 3764400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3764500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0244, - "step": 3764600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3764700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0237, - "step": 3764800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3764900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3765000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3765100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0258, - "step": 3765200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3765300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3765400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3765500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3765600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0259, - "step": 3765700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3765800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0268, - "step": 3765900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3766000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3766100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3766200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3766300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3766400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.024, - "step": 3766500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0244, - "step": 3766600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3766700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3766800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3766900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3767000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0243, - "step": 3767100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3767200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3767300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0238, - "step": 3767400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0258, - "step": 3767500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0262, - "step": 3767600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0259, - "step": 3767700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3767800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.026, - "step": 3767900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0261, - "step": 3768000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3768100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3768200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3768300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0257, - "step": 3768400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3768500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0262, - "step": 3768600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3768700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.024, - "step": 3768800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3768900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.024, - "step": 3769000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0257, - "step": 3769100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3769200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0257, - "step": 3769300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3769400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0258, - "step": 3769500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3769600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3769700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3769800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3769900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3770000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3770100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3770200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3770300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0245, - "step": 3770400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3770500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3770600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3770700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0262, - "step": 3770800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0257, - "step": 3770900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0241, - "step": 3771000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3771100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3771200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.024, - "step": 3771300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3771400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0265, - "step": 3771500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0259, - "step": 3771600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3771700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3771800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0265, - "step": 3771900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0244, - "step": 3772000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3772100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.027, - "step": 3772200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0259, - "step": 3772300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3772400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3772500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0257, - "step": 3772600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3772700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.026, - "step": 3772800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3772900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3773000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3773100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3773200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3773300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3773400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0261, - "step": 3773500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3773600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3773700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3773800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3773900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3774000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3774100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3774200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0257, - "step": 3774300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3774400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3774500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3774600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3774700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0259, - "step": 3774800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0271, - "step": 3774900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0239, - "step": 3775000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3775100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3775200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3775300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3775400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3775500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3775600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3775700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0257, - "step": 3775800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0258, - "step": 3775900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0243, - "step": 3776000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3776100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0243, - "step": 3776200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3776300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0259, - "step": 3776400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3776500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3776600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3776700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.024, - "step": 3776800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3776900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3777000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3777100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.026, - "step": 3777200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3777300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3777400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3777500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0258, - "step": 3777600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3777700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3777800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3777900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3778000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3778100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3778200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3778300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0259, - "step": 3778400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3778500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0245, - "step": 3778600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0257, - "step": 3778700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0259, - "step": 3778800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3778900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3779000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3779100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0259, - "step": 3779200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3779300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3779400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3779500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3779600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3779700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0258, - "step": 3779800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0244, - "step": 3779900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0242, - "step": 3780000 - }, - { - "epoch": 0.0, - "eval_loss": 0.024810791015625, - "eval_runtime": 129.0238, - "eval_samples_per_second": 387.525, - "eval_steps_per_second": 24.22, - "step": 3780000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0259, - "step": 3780100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3780200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0243, - "step": 3780300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0258, - "step": 3780400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3780500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3780600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3780700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3780800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3780900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3781000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0262, - "step": 3781100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0266, - "step": 3781200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3781300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0241, - "step": 3781400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3781500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3781600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3781700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3781800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0245, - "step": 3781900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3782000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0262, - "step": 3782100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3782200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3782300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0245, - "step": 3782400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3782500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3782600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3782700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0243, - "step": 3782800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0273, - "step": 3782900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3783000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3783100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3783200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0259, - "step": 3783300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3783400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3783500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3783600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0243, - "step": 3783700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0244, - "step": 3783800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0244, - "step": 3783900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3784000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0245, - "step": 3784100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3784200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0242, - "step": 3784300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3784400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3784500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0259, - "step": 3784600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0241, - "step": 3784700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0243, - "step": 3784800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0243, - "step": 3784900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0245, - "step": 3785000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3785100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3785200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3785300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3785400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0244, - "step": 3785500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3785600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0266, - "step": 3785700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0261, - "step": 3785800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3785900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3786000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0241, - "step": 3786100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3786200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3786300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0259, - "step": 3786400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0243, - "step": 3786500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3786600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3786700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0264, - "step": 3786800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3786900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3787000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3787100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3787200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3787300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0262, - "step": 3787400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3787500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3787600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3787700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0244, - "step": 3787800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3787900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3788000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0258, - "step": 3788100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0237, - "step": 3788200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3788300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3788400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0258, - "step": 3788500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3788600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3788700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3788800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0245, - "step": 3788900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3789000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0241, - "step": 3789100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0245, - "step": 3789200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0262, - "step": 3789300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0238, - "step": 3789400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3789500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3789600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3789700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3789800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3789900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3790000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3790100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0257, - "step": 3790200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0259, - "step": 3790300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0242, - "step": 3790400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0242, - "step": 3790500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0257, - "step": 3790600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3790700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0259, - "step": 3790800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3790900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3791000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0241, - "step": 3791100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3791200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0244, - "step": 3791300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3791400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3791500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0242, - "step": 3791600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3791700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3791800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3791900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3792000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3792100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3792200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3792300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0262, - "step": 3792400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0258, - "step": 3792500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3792600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3792700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3792800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3792900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0245, - "step": 3793000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3793100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3793200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0261, - "step": 3793300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3793400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3793500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0243, - "step": 3793600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0257, - "step": 3793700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3793800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.026, - "step": 3793900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.024, - "step": 3794000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3794100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3794200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3794300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3794400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3794500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0257, - "step": 3794600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3794700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3794800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3794900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0242, - "step": 3795000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0257, - "step": 3795100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3795200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3795300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3795400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3795500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0244, - "step": 3795600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3795700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3795800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3795900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3796000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3796100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0244, - "step": 3796200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3796300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3796400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3796500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0258, - "step": 3796600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0245, - "step": 3796700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3796800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3796900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3797000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3797100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0262, - "step": 3797200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3797300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0244, - "step": 3797400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3797500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3797600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3797700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.024, - "step": 3797800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0245, - "step": 3797900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.026, - "step": 3798000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0241, - "step": 3798100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0244, - "step": 3798200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0259, - "step": 3798300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3798400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0262, - "step": 3798500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3798600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3798700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0243, - "step": 3798800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0259, - "step": 3798900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0245, - "step": 3799000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0239, - "step": 3799100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3799200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3799300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.026, - "step": 3799400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3799500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0245, - "step": 3799600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0257, - "step": 3799700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0243, - "step": 3799800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3799900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3800000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0247955322265625, - "eval_runtime": 133.8194, - "eval_samples_per_second": 373.638, - "eval_steps_per_second": 23.352, - "step": 3800000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3800100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3800200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3800300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3800400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0245, - "step": 3800500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3800600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3800700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3800800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0259, - "step": 3800900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3801000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0261, - "step": 3801100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0258, - "step": 3801200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3801300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3801400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0243, - "step": 3801500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3801600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0244, - "step": 3801700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0244, - "step": 3801800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3801900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0259, - "step": 3802000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3802100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0257, - "step": 3802200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3802300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0243, - "step": 3802400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3802500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0257, - "step": 3802600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3802700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3802800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0259, - "step": 3802900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3803000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3803100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3803200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0245, - "step": 3803300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0258, - "step": 3803400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3803500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0267, - "step": 3803600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3803700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.024, - "step": 3803800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3803900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0259, - "step": 3804000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3804100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3804200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3804300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3804400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0243, - "step": 3804500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0263, - "step": 3804600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3804700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0244, - "step": 3804800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3804900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0243, - "step": 3805000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3805100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.024, - "step": 3805200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0241, - "step": 3805300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0259, - "step": 3805400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3805500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0245, - "step": 3805600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3805700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3805800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3805900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0245, - "step": 3806000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0265, - "step": 3806100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3806200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0235, - "step": 3806300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0242, - "step": 3806400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3806500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0243, - "step": 3806600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3806700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0236, - "step": 3806800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0244, - "step": 3806900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3807000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0258, - "step": 3807100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0245, - "step": 3807200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3807300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0244, - "step": 3807400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3807500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3807600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3807700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0259, - "step": 3807800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0245, - "step": 3807900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0242, - "step": 3808000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3808100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3808200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.026, - "step": 3808300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3808400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3808500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0259, - "step": 3808600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3808700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3808800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3808900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3809000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0245, - "step": 3809100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3809200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3809300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3809400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0241, - "step": 3809500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3809600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0244, - "step": 3809700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3809800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3809900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3810000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0245, - "step": 3810100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3810200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0259, - "step": 3810300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3810400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0239, - "step": 3810500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0244, - "step": 3810600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0258, - "step": 3810700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0261, - "step": 3810800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3810900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3811000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0262, - "step": 3811100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3811200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3811300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3811400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0242, - "step": 3811500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3811600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0259, - "step": 3811700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3811800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0244, - "step": 3811900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0243, - "step": 3812000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3812100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3812200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.024, - "step": 3812300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0262, - "step": 3812400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0259, - "step": 3812500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0257, - "step": 3812600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3812700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0237, - "step": 3812800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.026, - "step": 3812900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3813000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3813100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3813200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0243, - "step": 3813300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3813400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3813500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0243, - "step": 3813600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3813700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3813800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0244, - "step": 3813900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3814000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3814100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3814200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3814300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0242, - "step": 3814400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0242, - "step": 3814500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0241, - "step": 3814600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3814700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3814800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3814900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3815000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0264, - "step": 3815100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3815200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3815300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0245, - "step": 3815400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3815500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3815600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3815700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3815800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.024, - "step": 3815900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3816000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3816100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3816200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.026, - "step": 3816300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0259, - "step": 3816400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0241, - "step": 3816500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0239, - "step": 3816600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0259, - "step": 3816700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3816800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3816900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0242, - "step": 3817000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3817100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0243, - "step": 3817200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0242, - "step": 3817300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3817400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3817500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3817600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3817700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3817800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0258, - "step": 3817900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3818000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3818100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3818200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0261, - "step": 3818300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3818400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3818500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3818600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0237, - "step": 3818700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3818800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0245, - "step": 3818900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0263, - "step": 3819000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3819100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3819200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3819300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3819400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3819500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3819600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3819700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.024, - "step": 3819800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3819900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3820000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0247802734375, - "eval_runtime": 126.1573, - "eval_samples_per_second": 396.331, - "eval_steps_per_second": 24.771, - "step": 3820000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3820100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0245, - "step": 3820200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3820300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3820400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0259, - "step": 3820500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3820600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.026, - "step": 3820700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3820800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3820900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3821000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3821100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3821200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3821300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3821400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3821500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0243, - "step": 3821600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0244, - "step": 3821700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0268, - "step": 3821800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3821900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0266, - "step": 3822000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.024, - "step": 3822100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0242, - "step": 3822200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3822300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3822400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0244, - "step": 3822500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3822600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0261, - "step": 3822700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3822800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0238, - "step": 3822900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3823000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3823100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3823200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3823300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3823400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3823500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3823600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3823700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0243, - "step": 3823800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3823900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0245, - "step": 3824000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0245, - "step": 3824100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0262, - "step": 3824200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3824300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3824400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3824500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3824600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0243, - "step": 3824700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0257, - "step": 3824800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0234, - "step": 3824900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3825000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3825100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3825200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3825300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3825400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0244, - "step": 3825500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3825600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3825700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0243, - "step": 3825800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3825900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3826000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3826100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0257, - "step": 3826200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0258, - "step": 3826300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3826400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0259, - "step": 3826500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0241, - "step": 3826600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3826700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3826800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3826900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3827000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3827100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3827200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0257, - "step": 3827300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3827400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3827500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3827600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3827700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3827800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0245, - "step": 3827900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3828000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3828100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.026, - "step": 3828200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0242, - "step": 3828300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0268, - "step": 3828400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3828500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0258, - "step": 3828600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3828700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3828800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0259, - "step": 3828900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0245, - "step": 3829000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0258, - "step": 3829100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0258, - "step": 3829200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3829300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0245, - "step": 3829400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0245, - "step": 3829500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3829600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0263, - "step": 3829700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3829800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3829900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3830000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3830100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3830200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3830300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3830400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3830500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3830600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3830700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3830800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3830900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3831000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3831100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3831200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3831300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0237, - "step": 3831400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3831500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0242, - "step": 3831600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3831700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3831800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3831900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3832000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3832100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0257, - "step": 3832200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3832300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0241, - "step": 3832400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3832500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3832600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0244, - "step": 3832700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0242, - "step": 3832800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0257, - "step": 3832900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0261, - "step": 3833000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3833100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3833200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0264, - "step": 3833300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3833400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3833500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0245, - "step": 3833600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3833700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3833800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0238, - "step": 3833900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3834000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3834100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3834200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3834300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0245, - "step": 3834400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3834500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0257, - "step": 3834600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3834700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0242, - "step": 3834800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3834900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3835000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3835100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3835200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3835300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0257, - "step": 3835400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0244, - "step": 3835500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3835600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0238, - "step": 3835700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3835800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0242, - "step": 3835900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3836000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3836100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3836200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0244, - "step": 3836300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0245, - "step": 3836400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3836500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0258, - "step": 3836600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3836700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3836800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0237, - "step": 3836900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3837000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0236, - "step": 3837100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3837200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3837300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0244, - "step": 3837400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3837500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0245, - "step": 3837600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3837700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3837800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3837900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3838000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3838100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3838200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3838300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0259, - "step": 3838400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0245, - "step": 3838500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3838600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3838700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3838800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3838900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0245, - "step": 3839000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3839100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3839200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3839300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3839400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3839500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0258, - "step": 3839600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3839700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0236, - "step": 3839800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0245, - "step": 3839900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0268, - "step": 3840000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0247955322265625, - "eval_runtime": 136.6451, - "eval_samples_per_second": 365.911, - "eval_steps_per_second": 22.869, - "step": 3840000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3840100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3840200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0245, - "step": 3840300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3840400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3840500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3840600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3840700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3840800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0266, - "step": 3840900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3841000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0259, - "step": 3841100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3841200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0259, - "step": 3841300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.026, - "step": 3841400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3841500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3841600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0259, - "step": 3841700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3841800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0263, - "step": 3841900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3842000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0244, - "step": 3842100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3842200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0241, - "step": 3842300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0244, - "step": 3842400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0244, - "step": 3842500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3842600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3842700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3842800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0244, - "step": 3842900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0243, - "step": 3843000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3843100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3843200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0262, - "step": 3843300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3843400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0244, - "step": 3843500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0242, - "step": 3843600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3843700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3843800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3843900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3844000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3844100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3844200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0263, - "step": 3844300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0238, - "step": 3844400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3844500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0242, - "step": 3844600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3844700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0244, - "step": 3844800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0261, - "step": 3844900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3845000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0243, - "step": 3845100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3845200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3845300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0244, - "step": 3845400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3845500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0245, - "step": 3845600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3845700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0261, - "step": 3845800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3845900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3846000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0242, - "step": 3846100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3846200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0265, - "step": 3846300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3846400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0268, - "step": 3846500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0243, - "step": 3846600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3846700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3846800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3846900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3847000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0238, - "step": 3847100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3847200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0261, - "step": 3847300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3847400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0242, - "step": 3847500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0259, - "step": 3847600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3847700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3847800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0243, - "step": 3847900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3848000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0257, - "step": 3848100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3848200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0244, - "step": 3848300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3848400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0236, - "step": 3848500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0257, - "step": 3848600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0241, - "step": 3848700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0239, - "step": 3848800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0242, - "step": 3848900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3849000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0245, - "step": 3849100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3849200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3849300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3849400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3849500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3849600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3849700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0244, - "step": 3849800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3849900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.026, - "step": 3850000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3850100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3850200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.026, - "step": 3850300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3850400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3850500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0263, - "step": 3850600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3850700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3850800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3850900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3851000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3851100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3851200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0257, - "step": 3851300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0262, - "step": 3851400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0261, - "step": 3851500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0245, - "step": 3851600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3851700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0243, - "step": 3851800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3851900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3852000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3852100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3852200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3852300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0245, - "step": 3852400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.024, - "step": 3852500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0259, - "step": 3852600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0257, - "step": 3852700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.024, - "step": 3852800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.024, - "step": 3852900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0241, - "step": 3853000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3853100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.024, - "step": 3853200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3853300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3853400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0264, - "step": 3853500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3853600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0257, - "step": 3853700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0238, - "step": 3853800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0238, - "step": 3853900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3854000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3854100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3854200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3854300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0238, - "step": 3854400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0259, - "step": 3854500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3854600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3854700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3854800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3854900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3855000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3855100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3855200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3855300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0244, - "step": 3855400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0243, - "step": 3855500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0259, - "step": 3855600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3855700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0258, - "step": 3855800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0243, - "step": 3855900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3856000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0258, - "step": 3856100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0258, - "step": 3856200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3856300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0243, - "step": 3856400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3856500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3856600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0263, - "step": 3856700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3856800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3856900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3857000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3857100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0243, - "step": 3857200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0244, - "step": 3857300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3857400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3857500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3857600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3857700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3857800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0257, - "step": 3857900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0263, - "step": 3858000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3858100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3858200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3858300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0244, - "step": 3858400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0243, - "step": 3858500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3858600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3858700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3858800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0257, - "step": 3858900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3859000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3859100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3859200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3859300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3859400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3859500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0259, - "step": 3859600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3859700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3859800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3859900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3860000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0247802734375, - "eval_runtime": 141.9744, - "eval_samples_per_second": 352.176, - "eval_steps_per_second": 22.011, - "step": 3860000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3860100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.024, - "step": 3860200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3860300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3860400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3860500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0241, - "step": 3860600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3860700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3860800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3860900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3861000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0259, - "step": 3861100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3861200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3861300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0258, - "step": 3861400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3861500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3861600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3861700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3861800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3861900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3862000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3862100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3862200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3862300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3862400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3862500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3862600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0243, - "step": 3862700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0241, - "step": 3862800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3862900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3863000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0237, - "step": 3863100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0244, - "step": 3863200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3863300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0245, - "step": 3863400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3863500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3863600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3863700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3863800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3863900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0262, - "step": 3864000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0261, - "step": 3864100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3864200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0257, - "step": 3864300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0244, - "step": 3864400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3864500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3864600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3864700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3864800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0259, - "step": 3864900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3865000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0262, - "step": 3865100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3865200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3865300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3865400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.027, - "step": 3865500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3865600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3865700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0244, - "step": 3865800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3865900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3866000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3866100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0263, - "step": 3866200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0258, - "step": 3866300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.024, - "step": 3866400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.024, - "step": 3866500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3866600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0257, - "step": 3866700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0257, - "step": 3866800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0238, - "step": 3866900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3867000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0257, - "step": 3867100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0245, - "step": 3867200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3867300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3867400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3867500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3867600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3867700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3867800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3867900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3868000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3868100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0258, - "step": 3868200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3868300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0258, - "step": 3868400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3868500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3868600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.026, - "step": 3868700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3868800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3868900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3869000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3869100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3869200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3869300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0241, - "step": 3869400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3869500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3869600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3869700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3869800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3869900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3870000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0244, - "step": 3870100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3870200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0241, - "step": 3870300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.024, - "step": 3870400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3870500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3870600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3870700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3870800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3870900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3871000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3871100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0266, - "step": 3871200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3871300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3871400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3871500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.024, - "step": 3871600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0245, - "step": 3871700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0241, - "step": 3871800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0262, - "step": 3871900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0243, - "step": 3872000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0254, - "step": 3872100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3872200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3872300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3872400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3872500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0244, - "step": 3872600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3872700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0243, - "step": 3872800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3872900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3873000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3873100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3873200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3873300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0244, - "step": 3873400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0244, - "step": 3873500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3873600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3873700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3873800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0243, - "step": 3873900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3874000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0263, - "step": 3874100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0258, - "step": 3874200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3874300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0257, - "step": 3874400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0261, - "step": 3874500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0241, - "step": 3874600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0256, - "step": 3874700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3874800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3874900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3875000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3875100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0243, - "step": 3875200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3875300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3875400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3875500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3875600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3875700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3875800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3875900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3876000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3876100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3876200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0248, - "step": 3876300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3876400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0255, - "step": 3876500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3876600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3876700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3876800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3876900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3877000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3877100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0245, - "step": 3877200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3877300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0259, - "step": 3877400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0239, - "step": 3877500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0244, - "step": 3877600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3877700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0241, - "step": 3877800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.024, - "step": 3877900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0251, - "step": 3878000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3878100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3878200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0243, - "step": 3878300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3878400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0249, - "step": 3878500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0253, - "step": 3878600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3878700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0238, - "step": 3878800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.025, - "step": 3878900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0257, - "step": 3879000 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3879100 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0246, - "step": 3879200 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0236, - "step": 3879300 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0247, - "step": 3879400 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0258, - "step": 3879500 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3879600 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0245, - "step": 3879700 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0252, - "step": 3879800 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0234, - "step": 3879900 - }, - { - "epoch": 0.0, - "learning_rate": 1e-06, - "loss": 0.0241, - "step": 3880000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0247955322265625, - "eval_runtime": 141.1964, - "eval_samples_per_second": 354.117, - "eval_steps_per_second": 22.132, - "step": 3880000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.026, - "step": 3880100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 3880200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.024, - "step": 3880300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0243, - "step": 3880400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0258, - "step": 3880500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 3880600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0242, - "step": 3880700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 3880800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0258, - "step": 3880900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 3881000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 3881100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0257, - "step": 3881200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 3881300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0258, - "step": 3881400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.024, - "step": 3881500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 3881600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 3881700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 3881800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 3881900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 3882000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 3882100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0244, - "step": 3882200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 3882300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0263, - "step": 3882400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 3882500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 3882600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 3882700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 3882800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 3882900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0264, - "step": 3883000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 3883100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 3883200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 3883300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 3883400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 3883500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 3883600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 3883700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0262, - "step": 3883800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 3883900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 3884000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 3884100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 3884200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 3884300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0259, - "step": 3884400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0238, - "step": 3884500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 3884600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 3884700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 3884800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 3884900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0259, - "step": 3885000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0232, - "step": 3885100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 3885200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 3885300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 3885400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0243, - "step": 3885500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 3885600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0264, - "step": 3885700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 3885800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0244, - "step": 3885900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 3886000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 3886100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 3886200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 3886300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 3886400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0256, - "step": 3886500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 3886600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 3886700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 3886800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0243, - "step": 3886900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 3887000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0256, - "step": 3887100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 3887200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0241, - "step": 3887300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0259, - "step": 3887400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 3887500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 3887600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0258, - "step": 3887700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0257, - "step": 3887800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0243, - "step": 3887900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 3888000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 3888100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0244, - "step": 3888200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0243, - "step": 3888300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 3888400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.026, - "step": 3888500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.027, - "step": 3888600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0259, - "step": 3888700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 3888800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0234, - "step": 3888900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0267, - "step": 3889000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 3889100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0257, - "step": 3889200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0256, - "step": 3889300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 3889400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 3889500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0257, - "step": 3889600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 3889700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 3889800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 3889900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 3890000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 3890100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 3890200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 3890300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0272, - "step": 3890400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 3890500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 3890600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 3890700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 3890800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0256, - "step": 3890900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 3891000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 3891100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0238, - "step": 3891200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 3891300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0256, - "step": 3891400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 3891500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 3891600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0262, - "step": 3891700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0242, - "step": 3891800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 3891900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 3892000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 3892100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 3892200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 3892300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 3892400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 3892500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 3892600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 3892700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.026, - "step": 3892800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 3892900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 3893000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0242, - "step": 3893100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 3893200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0265, - "step": 3893300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 3893400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 3893500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 3893600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.026, - "step": 3893700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 3893800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 3893900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 3894000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0264, - "step": 3894100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 3894200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0256, - "step": 3894300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 3894400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0243, - "step": 3894500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 3894600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.026, - "step": 3894700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0256, - "step": 3894800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 3894900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 3895000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 3895100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 3895200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 3895300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 3895400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 3895500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 3895600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 3895700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 3895800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 3895900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0259, - "step": 3896000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 3896100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0265, - "step": 3896200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0242, - "step": 3896300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0242, - "step": 3896400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 3896500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 3896600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0244, - "step": 3896700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0258, - "step": 3896800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 3896900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 3897000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0244, - "step": 3897100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 3897200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 3897300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 3897400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 3897500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 3897600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 3897700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0242, - "step": 3897800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0258, - "step": 3897900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 3898000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 3898100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0237, - "step": 3898200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0263, - "step": 3898300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0239, - "step": 3898400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 3898500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0257, - "step": 3898600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 3898700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0258, - "step": 3898800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 3898900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0257, - "step": 3899000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0256, - "step": 3899100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 3899200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 3899300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 3899400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 3899500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 3899600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0257, - "step": 3899700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.026, - "step": 3899800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0242, - "step": 3899900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.024, - "step": 3900000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0247802734375, - "eval_runtime": 138.0188, - "eval_samples_per_second": 362.27, - "eval_steps_per_second": 22.642, - "step": 3900000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 3900100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 3900200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0238, - "step": 3900300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 3900400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 3900500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 3900600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 3900700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 3900800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 3900900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 3901000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 3901100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 3901200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 3901300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 3901400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 3901500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0257, - "step": 3901600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 3901700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 3901800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 3901900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 3902000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 3902100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 3902200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 3902300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0256, - "step": 3902400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 3902500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0259, - "step": 3902600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0259, - "step": 3902700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 3902800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 3902900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0257, - "step": 3903000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0241, - "step": 3903100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 3903200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 3903300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 3903400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 3903500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 3903600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0257, - "step": 3903700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 3903800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 3903900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0256, - "step": 3904000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 3904100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0258, - "step": 3904200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 3904300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 3904400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0261, - "step": 3904500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0256, - "step": 3904600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 3904700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0259, - "step": 3904800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 3904900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0242, - "step": 3905000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 3905100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0244, - "step": 3905200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 3905300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 3905400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0259, - "step": 3905500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 3905600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0244, - "step": 3905700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0241, - "step": 3905800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 3905900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 3906000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 3906100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 3906200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 3906300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 3906400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0256, - "step": 3906500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 3906600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 3906700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 3906800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 3906900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0257, - "step": 3907000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 3907100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 3907200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 3907300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0267, - "step": 3907400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0239, - "step": 3907500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 3907600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0241, - "step": 3907700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 3907800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 3907900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 3908000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 3908100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0243, - "step": 3908200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0261, - "step": 3908300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 3908400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 3908500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 3908600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 3908700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 3908800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 3908900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 3909000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 3909100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.024, - "step": 3909200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0256, - "step": 3909300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 3909400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 3909500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 3909600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 3909700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 3909800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 3909900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 3910000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0258, - "step": 3910100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 3910200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 3910300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 3910400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0243, - "step": 3910500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.024, - "step": 3910600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 3910700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.026, - "step": 3910800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 3910900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 3911000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 3911100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 3911200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0239, - "step": 3911300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 3911400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 3911500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 3911600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0258, - "step": 3911700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0257, - "step": 3911800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 3911900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 3912000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0257, - "step": 3912100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0242, - "step": 3912200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.024, - "step": 3912300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 3912400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 3912500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 3912600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0242, - "step": 3912700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0241, - "step": 3912800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 3912900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 3913000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 3913100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 3913200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0238, - "step": 3913300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 3913400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 3913500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0243, - "step": 3913600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 3913700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0259, - "step": 3913800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 3913900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 3914000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 3914100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0267, - "step": 3914200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0256, - "step": 3914300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0242, - "step": 3914400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 3914500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 3914600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 3914700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 3914800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 3914900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 3915000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 3915100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 3915200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 3915300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 3915400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 3915500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 3915600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 3915700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 3915800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 3915900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 3916000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 3916100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.024, - "step": 3916200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0241, - "step": 3916300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 3916400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 3916500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0244, - "step": 3916600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 3916700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0258, - "step": 3916800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 3916900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.026, - "step": 3917000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 3917100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 3917200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0242, - "step": 3917300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 3917400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 3917500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 3917600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 3917700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 3917800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 3917900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0259, - "step": 3918000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0242, - "step": 3918100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 3918200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0261, - "step": 3918300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 3918400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0257, - "step": 3918500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0256, - "step": 3918600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0242, - "step": 3918700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 3918800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 3918900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0244, - "step": 3919000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0243, - "step": 3919100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 3919200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 3919300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 3919400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 3919500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0257, - "step": 3919600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 3919700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 3919800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 3919900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0244, - "step": 3920000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0247802734375, - "eval_runtime": 139.7293, - "eval_samples_per_second": 357.835, - "eval_steps_per_second": 22.365, - "step": 3920000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 3920100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0257, - "step": 3920200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0256, - "step": 3920300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0262, - "step": 3920400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 3920500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0256, - "step": 3920600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 3920700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 3920800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 3920900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 3921000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 3921100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0258, - "step": 3921200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 3921300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0256, - "step": 3921400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 3921500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 3921600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 3921700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 3921800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0256, - "step": 3921900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 3922000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 3922100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 3922200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0259, - "step": 3922300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 3922400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 3922500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 3922600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0263, - "step": 3922700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 3922800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 3922900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 3923000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0241, - "step": 3923100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 3923200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0242, - "step": 3923300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 3923400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 3923500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0243, - "step": 3923600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.026, - "step": 3923700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 3923800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0262, - "step": 3923900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0244, - "step": 3924000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0256, - "step": 3924100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 3924200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 3924300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 3924400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 3924500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 3924600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 3924700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 3924800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 3924900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0244, - "step": 3925000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0256, - "step": 3925100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0265, - "step": 3925200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 3925300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 3925400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 3925500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 3925600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 3925700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 3925800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 3925900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 3926000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 3926100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0256, - "step": 3926200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 3926300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 3926400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 3926500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 3926600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 3926700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0243, - "step": 3926800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 3926900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0257, - "step": 3927000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0244, - "step": 3927100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0238, - "step": 3927200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 3927300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0239, - "step": 3927400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 3927500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 3927600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.024, - "step": 3927700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0256, - "step": 3927800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 3927900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 3928000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0242, - "step": 3928100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 3928200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 3928300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 3928400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 3928500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 3928600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 3928700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 3928800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0241, - "step": 3928900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0262, - "step": 3929000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0261, - "step": 3929100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 3929200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0257, - "step": 3929300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 3929400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 3929500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0256, - "step": 3929600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0241, - "step": 3929700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 3929800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 3929900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 3930000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 3930100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 3930200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0263, - "step": 3930300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 3930400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0244, - "step": 3930500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 3930600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0235, - "step": 3930700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 3930800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 3930900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 3931000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 3931100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 3931200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0257, - "step": 3931300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 3931400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 3931500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0243, - "step": 3931600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0244, - "step": 3931700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0241, - "step": 3931800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0241, - "step": 3931900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 3932000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 3932100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 3932200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 3932300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 3932400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 3932500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 3932600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 3932700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0243, - "step": 3932800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 3932900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 3933000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 3933100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0244, - "step": 3933200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 3933300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 3933400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 3933500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0256, - "step": 3933600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0244, - "step": 3933700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 3933800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 3933900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 3934000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 3934100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0258, - "step": 3934200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 3934300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.026, - "step": 3934400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 3934500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0258, - "step": 3934600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 3934700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.026, - "step": 3934800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 3934900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0239, - "step": 3935000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 3935100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 3935200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0242, - "step": 3935300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 3935400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 3935500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 3935600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0256, - "step": 3935700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 3935800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 3935900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0244, - "step": 3936000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0243, - "step": 3936100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 3936200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 3936300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0259, - "step": 3936400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 3936500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 3936600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0257, - "step": 3936700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0244, - "step": 3936800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0257, - "step": 3936900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 3937000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 3937100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 3937200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 3937300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 3937400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 3937500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 3937600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 3937700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 3937800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 3937900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 3938000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 3938100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 3938200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0244, - "step": 3938300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 3938400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 3938500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0241, - "step": 3938600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 3938700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 3938800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0243, - "step": 3938900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0241, - "step": 3939000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 3939100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 3939200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 3939300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0243, - "step": 3939400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 3939500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 3939600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 3939700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 3939800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 3939900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 3940000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0247650146484375, - "eval_runtime": 138.4031, - "eval_samples_per_second": 361.264, - "eval_steps_per_second": 22.579, - "step": 3940000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 3940100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 3940200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 3940300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 3940400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0244, - "step": 3940500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.024, - "step": 3940600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 3940700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 3940800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 3940900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 3941000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 3941100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0239, - "step": 3941200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0243, - "step": 3941300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0257, - "step": 3941400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0243, - "step": 3941500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 3941600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 3941700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 3941800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 3941900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 3942000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0257, - "step": 3942100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0236, - "step": 3942200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0237, - "step": 3942300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 3942400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0244, - "step": 3942500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 3942600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 3942700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 3942800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0256, - "step": 3942900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.026, - "step": 3943000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 3943100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 3943200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 3943300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0243, - "step": 3943400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 3943500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 3943600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 3943700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0259, - "step": 3943800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0256, - "step": 3943900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0262, - "step": 3944000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 3944100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 3944200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 3944300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0265, - "step": 3944400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0244, - "step": 3944500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 3944600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 3944700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 3944800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 3944900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 3945000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0261, - "step": 3945100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0257, - "step": 3945200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0256, - "step": 3945300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0266, - "step": 3945400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 3945500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0237, - "step": 3945600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 3945700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0256, - "step": 3945800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 3945900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 3946000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 3946100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0241, - "step": 3946200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 3946300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0261, - "step": 3946400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 3946500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0244, - "step": 3946600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 3946700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 3946800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0258, - "step": 3946900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0263, - "step": 3947000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 3947100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 3947200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 3947300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 3947400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0262, - "step": 3947500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 3947600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 3947700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0242, - "step": 3947800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0257, - "step": 3947900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 3948000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 3948100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0257, - "step": 3948200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 3948300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0237, - "step": 3948400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 3948500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0237, - "step": 3948600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 3948700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 3948800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.026, - "step": 3948900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 3949000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 3949100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0262, - "step": 3949200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0259, - "step": 3949300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 3949400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 3949500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 3949600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0257, - "step": 3949700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 3949800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0262, - "step": 3949900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 3950000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0243, - "step": 3950100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.026, - "step": 3950200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 3950300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 3950400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0239, - "step": 3950500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0244, - "step": 3950600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0235, - "step": 3950700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 3950800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 3950900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 3951000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 3951100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 3951200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 3951300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0241, - "step": 3951400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0244, - "step": 3951500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0258, - "step": 3951600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 3951700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 3951800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 3951900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 3952000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 3952100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0261, - "step": 3952200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 3952300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 3952400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 3952500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0258, - "step": 3952600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 3952700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0258, - "step": 3952800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 3952900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 3953000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 3953100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 3953200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 3953300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 3953400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 3953500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0244, - "step": 3953600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0244, - "step": 3953700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 3953800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0265, - "step": 3953900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 3954000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0256, - "step": 3954100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0257, - "step": 3954200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0257, - "step": 3954300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 3954400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.024, - "step": 3954500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 3954600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 3954700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0241, - "step": 3954800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.024, - "step": 3954900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 3955000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 3955100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.024, - "step": 3955200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.024, - "step": 3955300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 3955400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0242, - "step": 3955500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 3955600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0242, - "step": 3955700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0244, - "step": 3955800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 3955900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 3956000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0256, - "step": 3956100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 3956200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0261, - "step": 3956300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 3956400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0242, - "step": 3956500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 3956600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0258, - "step": 3956700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 3956800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 3956900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 3957000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 3957100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 3957200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 3957300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 3957400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0262, - "step": 3957500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0258, - "step": 3957600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 3957700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0257, - "step": 3957800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 3957900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0241, - "step": 3958000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0243, - "step": 3958100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 3958200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 3958300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0244, - "step": 3958400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 3958500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 3958600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0237, - "step": 3958700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 3958800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 3958900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 3959000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 3959100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0241, - "step": 3959200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0266, - "step": 3959300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 3959400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 3959500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 3959600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 3959700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0258, - "step": 3959800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 3959900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 3960000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0247650146484375, - "eval_runtime": 143.2715, - "eval_samples_per_second": 348.988, - "eval_steps_per_second": 21.812, - "step": 3960000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 3960100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0256, - "step": 3960200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 3960300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 3960400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0256, - "step": 3960500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 3960600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 3960700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0243, - "step": 3960800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 3960900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 3961000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 3961100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 3961200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 3961300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 3961400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.024, - "step": 3961500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0243, - "step": 3961600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 3961700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.024, - "step": 3961800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 3961900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 3962000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 3962100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 3962200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 3962300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 3962400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 3962500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.024, - "step": 3962600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0237, - "step": 3962700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 3962800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.024, - "step": 3962900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 3963000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.024, - "step": 3963100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0243, - "step": 3963200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 3963300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0256, - "step": 3963400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 3963500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 3963600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 3963700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 3963800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0264, - "step": 3963900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 3964000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0239, - "step": 3964100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 3964200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 3964300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 3964400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0243, - "step": 3964500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0256, - "step": 3964600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 3964700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 3964800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0244, - "step": 3964900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 3965000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0238, - "step": 3965100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0241, - "step": 3965200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0256, - "step": 3965300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0259, - "step": 3965400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 3965500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 3965600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0256, - "step": 3965700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 3965800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 3965900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 3966000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0242, - "step": 3966100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0243, - "step": 3966200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 3966300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0263, - "step": 3966400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0256, - "step": 3966500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 3966600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 3966700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 3966800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 3966900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 3967000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0256, - "step": 3967100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 3967200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 3967300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 3967400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 3967500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 3967600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0242, - "step": 3967700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 3967800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0257, - "step": 3967900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 3968000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 3968100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 3968200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 3968300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 3968400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0258, - "step": 3968500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0256, - "step": 3968600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.027, - "step": 3968700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 3968800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0256, - "step": 3968900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0244, - "step": 3969000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 3969100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0244, - "step": 3969200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 3969300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 3969400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 3969500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0264, - "step": 3969600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0241, - "step": 3969700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0259, - "step": 3969800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 3969900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 3970000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 3970100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 3970200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0256, - "step": 3970300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0239, - "step": 3970400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 3970500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0244, - "step": 3970600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 3970700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 3970800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 3970900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 3971000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 3971100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 3971200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 3971300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 3971400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 3971500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0243, - "step": 3971600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 3971700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0256, - "step": 3971800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0259, - "step": 3971900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0244, - "step": 3972000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 3972100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0256, - "step": 3972200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 3972300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 3972400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 3972500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 3972600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 3972700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0243, - "step": 3972800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 3972900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0258, - "step": 3973000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0243, - "step": 3973100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 3973200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.024, - "step": 3973300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 3973400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 3973500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 3973600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 3973700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 3973800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 3973900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 3974000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 3974100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 3974200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 3974300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 3974400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 3974500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0239, - "step": 3974600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0256, - "step": 3974700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 3974800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 3974900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 3975000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 3975100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0258, - "step": 3975200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 3975300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 3975400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0241, - "step": 3975500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 3975600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 3975700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0269, - "step": 3975800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 3975900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 3976000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 3976100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 3976200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 3976300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 3976400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 3976500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 3976600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0243, - "step": 3976700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 3976800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 3976900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 3977000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 3977100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 3977200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 3977300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 3977400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 3977500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0244, - "step": 3977600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 3977700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0256, - "step": 3977800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0257, - "step": 3977900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 3978000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 3978100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 3978200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 3978300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 3978400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 3978500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0243, - "step": 3978600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0243, - "step": 3978700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 3978800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0244, - "step": 3978900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 3979000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 3979100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 3979200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 3979300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0243, - "step": 3979400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 3979500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 3979600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 3979700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 3979800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 3979900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0257, - "step": 3980000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0247650146484375, - "eval_runtime": 141.3953, - "eval_samples_per_second": 353.618, - "eval_steps_per_second": 22.101, - "step": 3980000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0256, - "step": 3980100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 3980200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0261, - "step": 3980300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0242, - "step": 3980400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 3980500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 3980600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 3980700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0256, - "step": 3980800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.026, - "step": 3980900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 3981000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 3981100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.024, - "step": 3981200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 3981300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 3981400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 3981500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 3981600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 3981700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 3981800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 3981900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 3982000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.024, - "step": 3982100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 3982200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 3982300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 3982400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 3982500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0244, - "step": 3982600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0258, - "step": 3982700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.026, - "step": 3982800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 3982900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 3983000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 3983100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0242, - "step": 3983200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0261, - "step": 3983300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 3983400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 3983500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 3983600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0237, - "step": 3983700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 3983800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 3983900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 3984000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0261, - "step": 3984100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0263, - "step": 3984200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0258, - "step": 3984300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 3984400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 3984500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 3984600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 3984700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 3984800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0244, - "step": 3984900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 3985000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 3985100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0239, - "step": 3985200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0243, - "step": 3985300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0259, - "step": 3985400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0256, - "step": 3985500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0257, - "step": 3985600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0259, - "step": 3985700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 3985800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0263, - "step": 3985900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0238, - "step": 3986000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 3986100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0243, - "step": 3986200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0238, - "step": 3986300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 3986400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0263, - "step": 3986500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0257, - "step": 3986600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.024, - "step": 3986700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0258, - "step": 3986800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 3986900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0241, - "step": 3987000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 3987100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 3987200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 3987300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0238, - "step": 3987400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0244, - "step": 3987500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0256, - "step": 3987600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 3987700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 3987800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0241, - "step": 3987900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 3988000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 3988100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 3988200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 3988300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 3988400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0244, - "step": 3988500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0241, - "step": 3988600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0244, - "step": 3988700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0264, - "step": 3988800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 3988900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 3989000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 3989100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0244, - "step": 3989200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 3989300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0256, - "step": 3989400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 3989500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 3989600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.026, - "step": 3989700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 3989800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0241, - "step": 3989900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.024, - "step": 3990000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0257, - "step": 3990100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 3990200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.024, - "step": 3990300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0256, - "step": 3990400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 3990500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0244, - "step": 3990600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0241, - "step": 3990700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 3990800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 3990900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0241, - "step": 3991000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 3991100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 3991200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 3991300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0272, - "step": 3991400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 3991500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 3991600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 3991700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 3991800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 3991900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 3992000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 3992100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 3992200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 3992300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 3992400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 3992500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 3992600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 3992700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0257, - "step": 3992800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 3992900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 3993000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0241, - "step": 3993100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0257, - "step": 3993200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 3993300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 3993400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0244, - "step": 3993500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 3993600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 3993700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 3993800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 3993900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 3994000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 3994100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0259, - "step": 3994200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 3994300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 3994400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 3994500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 3994600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 3994700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 3994800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 3994900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 3995000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0244, - "step": 3995100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0257, - "step": 3995200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0244, - "step": 3995300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 3995400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 3995500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 3995600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 3995700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 3995800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 3995900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0263, - "step": 3996000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 3996100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 3996200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 3996300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0257, - "step": 3996400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 3996500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.026, - "step": 3996600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0243, - "step": 3996700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0257, - "step": 3996800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0259, - "step": 3996900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0264, - "step": 3997000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 3997100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 3997200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 3997300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 3997400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0235, - "step": 3997500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 3997600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 3997700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 3997800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 3997900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 3998000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 3998100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 3998200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0264, - "step": 3998300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 3998400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 3998500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 3998600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 3998700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 3998800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0238, - "step": 3998900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 3999000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 3999100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 3999200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 3999300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 3999400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 3999500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 3999600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 3999700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0243, - "step": 3999800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 3999900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 4000000 - }, - { - "epoch": 0.0, - "eval_loss": 0.024749755859375, - "eval_runtime": 134.1841, - "eval_samples_per_second": 372.622, - "eval_steps_per_second": 23.289, - "step": 4000000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 4000100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 4000200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 4000300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 4000400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 4000500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0243, - "step": 4000600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 4000700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 4000800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 4000900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0244, - "step": 4001000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 4001100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 4001200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 4001300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 4001400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 4001500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 4001600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0265, - "step": 4001700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 4001800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 4001900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 4002000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 4002100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0243, - "step": 4002200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 4002300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 4002400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 4002500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0244, - "step": 4002600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 4002700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0257, - "step": 4002800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0258, - "step": 4002900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 4003000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 4003100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0243, - "step": 4003200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 4003300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 4003400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 4003500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0243, - "step": 4003600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 4003700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 4003800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0238, - "step": 4003900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 4004000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 4004100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 4004200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 4004300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 4004400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.024, - "step": 4004500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 4004600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 4004700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 4004800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0239, - "step": 4004900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 4005000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 4005100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 4005200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 4005300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 4005400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 4005500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 4005600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0257, - "step": 4005700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0242, - "step": 4005800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 4005900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 4006000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 4006100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0241, - "step": 4006200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0242, - "step": 4006300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 4006400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 4006500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 4006600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 4006700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0243, - "step": 4006800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0263, - "step": 4006900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.026, - "step": 4007000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 4007100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0238, - "step": 4007200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 4007300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0241, - "step": 4007400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 4007500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.024, - "step": 4007600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 4007700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0235, - "step": 4007800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0234, - "step": 4007900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0257, - "step": 4008000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0243, - "step": 4008100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 4008200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 4008300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0244, - "step": 4008400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 4008500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 4008600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0242, - "step": 4008700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 4008800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 4008900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 4009000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 4009100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 4009200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 4009300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0241, - "step": 4009400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 4009500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 4009600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.024, - "step": 4009700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0241, - "step": 4009800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 4009900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 4010000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0259, - "step": 4010100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0242, - "step": 4010200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 4010300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 4010400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 4010500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 4010600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 4010700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 4010800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 4010900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0243, - "step": 4011000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 4011100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0258, - "step": 4011200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 4011300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0259, - "step": 4011400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 4011500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 4011600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 4011700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.026, - "step": 4011800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 4011900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 4012000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0242, - "step": 4012100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 4012200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0244, - "step": 4012300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0244, - "step": 4012400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0257, - "step": 4012500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0244, - "step": 4012600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 4012700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 4012800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0243, - "step": 4012900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 4013000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 4013100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 4013200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0241, - "step": 4013300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 4013400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 4013500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 4013600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0244, - "step": 4013700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 4013800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0244, - "step": 4013900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.026, - "step": 4014000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.026, - "step": 4014100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.024, - "step": 4014200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0256, - "step": 4014300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0256, - "step": 4014400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 4014500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0239, - "step": 4014600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 4014700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 4014800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 4014900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0242, - "step": 4015000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 4015100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 4015200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 4015300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 4015400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 4015500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 4015600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 4015700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 4015800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0244, - "step": 4015900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0242, - "step": 4016000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 4016100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0244, - "step": 4016200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 4016300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 4016400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 4016500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 4016600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0241, - "step": 4016700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0258, - "step": 4016800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 4016900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 4017000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.024, - "step": 4017100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 4017200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0244, - "step": 4017300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 4017400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0244, - "step": 4017500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0259, - "step": 4017600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0257, - "step": 4017700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 4017800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 4017900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 4018000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 4018100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 4018200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 4018300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 4018400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.026, - "step": 4018500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0242, - "step": 4018600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0243, - "step": 4018700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 4018800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 4018900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 4019000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 4019100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 4019200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0259, - "step": 4019300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 4019400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 4019500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0242, - "step": 4019600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 4019700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 4019800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0243, - "step": 4019900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 4020000 - }, - { - "epoch": 0.0, - "eval_loss": 0.024749755859375, - "eval_runtime": 136.2543, - "eval_samples_per_second": 366.961, - "eval_steps_per_second": 22.935, - "step": 4020000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 4020100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 4020200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 4020300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0273, - "step": 4020400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 4020500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 4020600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 4020700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 4020800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0257, - "step": 4020900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0243, - "step": 4021000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 4021100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 4021200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0234, - "step": 4021300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0244, - "step": 4021400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 4021500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 4021600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 4021700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0256, - "step": 4021800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 4021900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 4022000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 4022100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0244, - "step": 4022200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0272, - "step": 4022300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 4022400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0263, - "step": 4022500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 4022600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 4022700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0257, - "step": 4022800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 4022900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.024, - "step": 4023000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 4023100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 4023200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 4023300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 4023400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 4023500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.024, - "step": 4023600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 4023700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 4023800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 4023900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 4024000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 4024100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 4024200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 4024300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0259, - "step": 4024400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 4024500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0258, - "step": 4024600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 4024700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0244, - "step": 4024800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 4024900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 4025000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.024, - "step": 4025100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 4025200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0235, - "step": 4025300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 4025400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0256, - "step": 4025500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0242, - "step": 4025600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 4025700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0256, - "step": 4025800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0267, - "step": 4025900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 4026000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0244, - "step": 4026100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 4026200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 4026300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 4026400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 4026500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0261, - "step": 4026600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 4026700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 4026800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0256, - "step": 4026900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 4027000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 4027100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0256, - "step": 4027200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0257, - "step": 4027300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 4027400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0259, - "step": 4027500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 4027600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0256, - "step": 4027700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 4027800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 4027900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 4028000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 4028100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0263, - "step": 4028200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0257, - "step": 4028300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0244, - "step": 4028400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 4028500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0258, - "step": 4028600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 4028700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 4028800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 4028900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0258, - "step": 4029000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 4029100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 4029200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 4029300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0258, - "step": 4029400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0259, - "step": 4029500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 4029600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0242, - "step": 4029700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 4029800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0238, - "step": 4029900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0256, - "step": 4030000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0243, - "step": 4030100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0256, - "step": 4030200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 4030300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 4030400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 4030500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 4030600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0244, - "step": 4030700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 4030800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 4030900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 4031000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 4031100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0256, - "step": 4031200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0257, - "step": 4031300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0271, - "step": 4031400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0258, - "step": 4031500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 4031600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 4031700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 4031800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 4031900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 4032000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 4032100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 4032200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0257, - "step": 4032300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 4032400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0258, - "step": 4032500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0262, - "step": 4032600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0261, - "step": 4032700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 4032800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0238, - "step": 4032900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 4033000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0267, - "step": 4033100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 4033200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0256, - "step": 4033300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 4033400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.026, - "step": 4033500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0257, - "step": 4033600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 4033700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 4033800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 4033900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0262, - "step": 4034000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 4034100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 4034200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 4034300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 4034400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0258, - "step": 4034500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 4034600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 4034700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0256, - "step": 4034800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0242, - "step": 4034900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0242, - "step": 4035000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0239, - "step": 4035100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0243, - "step": 4035200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0259, - "step": 4035300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 4035400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0244, - "step": 4035500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 4035600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 4035700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 4035800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 4035900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0256, - "step": 4036000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 4036100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0242, - "step": 4036200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 4036300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0257, - "step": 4036400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0241, - "step": 4036500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0256, - "step": 4036600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0242, - "step": 4036700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 4036800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 4036900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 4037000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 4037100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 4037200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0241, - "step": 4037300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0256, - "step": 4037400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 4037500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0257, - "step": 4037600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 4037700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 4037800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 4037900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 4038000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0256, - "step": 4038100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 4038200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0256, - "step": 4038300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 4038400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 4038500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0239, - "step": 4038600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.026, - "step": 4038700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0257, - "step": 4038800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0242, - "step": 4038900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 4039000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 4039100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 4039200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0269, - "step": 4039300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 4039400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0261, - "step": 4039500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 4039600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.026, - "step": 4039700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 4039800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 4039900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 4040000 - }, - { - "epoch": 0.0, - "eval_loss": 0.024749755859375, - "eval_runtime": 136.601, - "eval_samples_per_second": 366.029, - "eval_steps_per_second": 22.877, - "step": 4040000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 4040100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 4040200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 4040300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 4040400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 4040500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 4040600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 4040700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0257, - "step": 4040800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 4040900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 4041000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 4041100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0244, - "step": 4041200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 4041300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 4041400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 4041500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 4041600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 4041700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 4041800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 4041900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 4042000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0263, - "step": 4042100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0232, - "step": 4042200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 4042300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 4042400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 4042500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 4042600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 4042700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 4042800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 4042900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 4043000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 4043100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 4043200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0243, - "step": 4043300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0261, - "step": 4043400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 4043500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 4043600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0257, - "step": 4043700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 4043800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0242, - "step": 4043900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 4044000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0258, - "step": 4044100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0256, - "step": 4044200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 4044300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 4044400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0257, - "step": 4044500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 4044600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0257, - "step": 4044700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 4044800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0241, - "step": 4044900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 4045000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 4045100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0263, - "step": 4045200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 4045300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 4045400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 4045500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 4045600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0256, - "step": 4045700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 4045800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 4045900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0256, - "step": 4046000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 4046100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 4046200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 4046300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0242, - "step": 4046400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0259, - "step": 4046500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0242, - "step": 4046600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0257, - "step": 4046700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 4046800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 4046900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 4047000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 4047100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 4047200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 4047300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 4047400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 4047500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 4047600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 4047700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 4047800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 4047900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 4048000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0258, - "step": 4048100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.024, - "step": 4048200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 4048300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0242, - "step": 4048400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0263, - "step": 4048500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 4048600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 4048700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 4048800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 4048900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0256, - "step": 4049000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0256, - "step": 4049100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0243, - "step": 4049200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 4049300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 4049400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 4049500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0259, - "step": 4049600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 4049700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 4049800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 4049900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 4050000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 4050100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 4050200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 4050300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 4050400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 4050500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 4050600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 4050700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 4050800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 4050900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 4051000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 4051100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0237, - "step": 4051200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 4051300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 4051400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 4051500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 4051600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0261, - "step": 4051700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 4051800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 4051900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 4052000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 4052100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 4052200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 4052300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 4052400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 4052500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 4052600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 4052700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.024, - "step": 4052800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 4052900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0244, - "step": 4053000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 4053100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0258, - "step": 4053200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 4053300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.026, - "step": 4053400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 4053500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0259, - "step": 4053600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 4053700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 4053800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 4053900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 4054000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 4054100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0243, - "step": 4054200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.024, - "step": 4054300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 4054400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0243, - "step": 4054500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 4054600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 4054700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 4054800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 4054900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 4055000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0276, - "step": 4055100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 4055200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 4055300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 4055400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 4055500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 4055600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 4055700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 4055800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0256, - "step": 4055900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 4056000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0256, - "step": 4056100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0238, - "step": 4056200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 4056300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 4056400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 4056500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 4056600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0234, - "step": 4056700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.024, - "step": 4056800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 4056900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 4057000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 4057100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 4057200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 4057300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 4057400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0243, - "step": 4057500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0259, - "step": 4057600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 4057700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 4057800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0241, - "step": 4057900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 4058000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 4058100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0257, - "step": 4058200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0256, - "step": 4058300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 4058400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 4058500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0236, - "step": 4058600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 4058700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0243, - "step": 4058800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 4058900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 4059000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0256, - "step": 4059100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 4059200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 4059300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 4059400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0239, - "step": 4059500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 4059600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 4059700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 4059800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 4059900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 4060000 - }, - { - "epoch": 0.0, - "eval_loss": 0.024749755859375, - "eval_runtime": 140.5228, - "eval_samples_per_second": 355.814, - "eval_steps_per_second": 22.238, - "step": 4060000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0241, - "step": 4060100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 4060200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 4060300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 4060400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0242, - "step": 4060500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 4060600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0256, - "step": 4060700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 4060800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 4060900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.026, - "step": 4061000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 4061100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 4061200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 4061300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0242, - "step": 4061400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 4061500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0238, - "step": 4061600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0244, - "step": 4061700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0243, - "step": 4061800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 4061900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 4062000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 4062100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 4062200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 4062300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 4062400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 4062500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0256, - "step": 4062600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 4062700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 4062800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 4062900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 4063000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 4063100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0257, - "step": 4063200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 4063300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 4063400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0259, - "step": 4063500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0259, - "step": 4063600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0241, - "step": 4063700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 4063800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 4063900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 4064000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0256, - "step": 4064100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0242, - "step": 4064200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 4064300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0243, - "step": 4064400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0244, - "step": 4064500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 4064600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 4064700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 4064800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 4064900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 4065000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 4065100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 4065200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 4065300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 4065400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0257, - "step": 4065500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0257, - "step": 4065600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0256, - "step": 4065700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 4065800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 4065900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 4066000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 4066100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 4066200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 4066300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 4066400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0239, - "step": 4066500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 4066600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 4066700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 4066800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 4066900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0257, - "step": 4067000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 4067100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0243, - "step": 4067200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 4067300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 4067400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 4067500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0242, - "step": 4067600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 4067700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0239, - "step": 4067800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 4067900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 4068000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0242, - "step": 4068100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 4068200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 4068300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 4068400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 4068500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0262, - "step": 4068600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.024, - "step": 4068700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0244, - "step": 4068800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0242, - "step": 4068900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 4069000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0243, - "step": 4069100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0262, - "step": 4069200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 4069300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 4069400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0257, - "step": 4069500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 4069600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0244, - "step": 4069700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 4069800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 4069900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.026, - "step": 4070000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 4070100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 4070200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0256, - "step": 4070300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0236, - "step": 4070400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 4070500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 4070600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 4070700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 4070800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.024, - "step": 4070900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 4071000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 4071100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.024, - "step": 4071200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 4071300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0243, - "step": 4071400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 4071500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 4071600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 4071700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 4071800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 4071900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0237, - "step": 4072000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0258, - "step": 4072100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0264, - "step": 4072200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 4072300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 4072400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 4072500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 4072600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 4072700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 4072800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 4072900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 4073000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 4073100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0243, - "step": 4073200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 4073300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 4073400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0243, - "step": 4073500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 4073600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 4073700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0256, - "step": 4073800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0258, - "step": 4073900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0257, - "step": 4074000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 4074100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0244, - "step": 4074200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0257, - "step": 4074300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 4074400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 4074500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 4074600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 4074700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 4074800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 4074900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 4075000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 4075100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 4075200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0238, - "step": 4075300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 4075400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 4075500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 4075600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 4075700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0243, - "step": 4075800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 4075900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 4076000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 4076100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 4076200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 4076300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 4076400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 4076500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0258, - "step": 4076600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 4076700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 4076800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 4076900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 4077000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 4077100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 4077200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0244, - "step": 4077300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0242, - "step": 4077400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 4077500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0257, - "step": 4077600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0238, - "step": 4077700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 4077800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 4077900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 4078000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 4078100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0244, - "step": 4078200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 4078300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 4078400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 4078500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 4078600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 4078700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0266, - "step": 4078800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 4078900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0262, - "step": 4079000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 4079100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 4079200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 4079300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 4079400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 4079500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 4079600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 4079700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 4079800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 4079900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0256, - "step": 4080000 - }, - { - "epoch": 0.0, - "eval_loss": 0.024749755859375, - "eval_runtime": 137.9953, - "eval_samples_per_second": 362.331, - "eval_steps_per_second": 22.646, - "step": 4080000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 4080100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 4080200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 4080300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.024, - "step": 4080400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0242, - "step": 4080500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0258, - "step": 4080600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 4080700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0244, - "step": 4080800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 4080900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 4081000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 4081100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 4081200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 4081300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 4081400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 4081500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 4081600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0256, - "step": 4081700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 4081800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 4081900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0242, - "step": 4082000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0244, - "step": 4082100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0258, - "step": 4082200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 4082300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 4082400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 4082500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 4082600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0243, - "step": 4082700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 4082800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 4082900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0243, - "step": 4083000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 4083100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 4083200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 4083300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0244, - "step": 4083400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 4083500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 4083600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0238, - "step": 4083700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 4083800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 4083900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 4084000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 4084100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 4084200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 4084300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 4084400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.024, - "step": 4084500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 4084600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0243, - "step": 4084700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 4084800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0257, - "step": 4084900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0244, - "step": 4085000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 4085100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0256, - "step": 4085200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0257, - "step": 4085300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 4085400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 4085500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 4085600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 4085700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.024, - "step": 4085800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 4085900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 4086000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0258, - "step": 4086100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0256, - "step": 4086200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 4086300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 4086400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 4086500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0257, - "step": 4086600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 4086700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 4086800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 4086900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 4087000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 4087100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 4087200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0244, - "step": 4087300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0262, - "step": 4087400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 4087500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 4087600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 4087700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 4087800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 4087900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 4088000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.024, - "step": 4088100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 4088200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 4088300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 4088400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 4088500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0259, - "step": 4088600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 4088700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 4088800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 4088900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 4089000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 4089100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 4089200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0244, - "step": 4089300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0263, - "step": 4089400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0259, - "step": 4089500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 4089600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 4089700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 4089800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 4089900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 4090000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 4090100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.024, - "step": 4090200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 4090300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 4090400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 4090500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 4090600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 4090700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0244, - "step": 4090800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 4090900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0231, - "step": 4091000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 4091100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 4091200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 4091300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 4091400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0242, - "step": 4091500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 4091600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0238, - "step": 4091700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 4091800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 4091900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 4092000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0244, - "step": 4092100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 4092200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0239, - "step": 4092300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 4092400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0263, - "step": 4092500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0242, - "step": 4092600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 4092700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 4092800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 4092900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 4093000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 4093100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 4093200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 4093300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0244, - "step": 4093400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 4093500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0256, - "step": 4093600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 4093700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 4093800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 4093900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0243, - "step": 4094000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 4094100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 4094200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 4094300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 4094400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 4094500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 4094600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0243, - "step": 4094700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 4094800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 4094900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 4095000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 4095100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 4095200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 4095300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0242, - "step": 4095400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 4095500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.024, - "step": 4095600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0235, - "step": 4095700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 4095800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 4095900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 4096000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.024, - "step": 4096100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 4096200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 4096300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 4096400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0244, - "step": 4096500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 4096600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 4096700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0256, - "step": 4096800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 4096900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 4097000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 4097100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0256, - "step": 4097200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0241, - "step": 4097300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 4097400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 4097500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 4097600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 4097700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 4097800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 4097900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0242, - "step": 4098000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 4098100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0261, - "step": 4098200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0257, - "step": 4098300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0244, - "step": 4098400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 4098500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0243, - "step": 4098600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0242, - "step": 4098700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 4098800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0241, - "step": 4098900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0241, - "step": 4099000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0259, - "step": 4099100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 4099200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 4099300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.024, - "step": 4099400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0258, - "step": 4099500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 4099600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0244, - "step": 4099700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 4099800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 4099900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0243, - "step": 4100000 - }, - { - "epoch": 0.0, - "eval_loss": 0.0247650146484375, - "eval_runtime": 143.0486, - "eval_samples_per_second": 349.532, - "eval_steps_per_second": 21.846, - "step": 4100000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 4100100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0257, - "step": 4100200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0244, - "step": 4100300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 4100400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 4100500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0258, - "step": 4100600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 4100700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 4100800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 4100900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0256, - "step": 4101000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0242, - "step": 4101100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 4101200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 4101300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 4101400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0242, - "step": 4101500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 4101600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0256, - "step": 4101700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0244, - "step": 4101800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 4101900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0242, - "step": 4102000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 4102100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0238, - "step": 4102200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 4102300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 4102400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 4102500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 4102600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0266, - "step": 4102700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0242, - "step": 4102800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 4102900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 4103000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0257, - "step": 4103100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 4103200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 4103300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0258, - "step": 4103400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0243, - "step": 4103500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 4103600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 4103700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0259, - "step": 4103800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0256, - "step": 4103900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 4104000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 4104100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 4104200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0236, - "step": 4104300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 4104400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.024, - "step": 4104500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.026, - "step": 4104600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 4104700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 4104800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 4104900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 4105000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0241, - "step": 4105100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 4105200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 4105300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 4105400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 4105500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 4105600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 4105700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 4105800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0242, - "step": 4105900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0261, - "step": 4106000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0241, - "step": 4106100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0242, - "step": 4106200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 4106300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 4106400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 4106500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 4106600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.026, - "step": 4106700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 4106800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0262, - "step": 4106900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 4107000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 4107100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 4107200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 4107300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 4107400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 4107500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 4107600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0267, - "step": 4107700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 4107800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 4107900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.026, - "step": 4108000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 4108100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 4108200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0239, - "step": 4108300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 4108400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 4108500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0244, - "step": 4108600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 4108700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 4108800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 4108900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 4109000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 4109100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0257, - "step": 4109200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 4109300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 4109400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 4109500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.026, - "step": 4109600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 4109700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 4109800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 4109900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 4110000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 4110100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 4110200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.026, - "step": 4110300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0239, - "step": 4110400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0244, - "step": 4110500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 4110600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 4110700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 4110800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.026, - "step": 4110900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 4111000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 4111100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 4111200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0244, - "step": 4111300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0241, - "step": 4111400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.024, - "step": 4111500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0261, - "step": 4111600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 4111700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0244, - "step": 4111800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 4111900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 4112000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 4112100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 4112200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 4112300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 4112400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0242, - "step": 4112500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0242, - "step": 4112600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0241, - "step": 4112700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 4112800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0244, - "step": 4112900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.024, - "step": 4113000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 4113100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0241, - "step": 4113200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 4113300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 4113400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 4113500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 4113600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 4113700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 4113800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 4113900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 4114000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0258, - "step": 4114100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 4114200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 4114300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 4114400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0243, - "step": 4114500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 4114600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 4114700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0232, - "step": 4114800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 4114900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0241, - "step": 4115000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 4115100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0271, - "step": 4115200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 4115300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 4115400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0258, - "step": 4115500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0245, - "step": 4115600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 4115700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.024, - "step": 4115800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 4115900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0242, - "step": 4116000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 4116100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.024, - "step": 4116200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 4116300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 4116400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0256, - "step": 4116500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 4116600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0259, - "step": 4116700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0243, - "step": 4116800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 4116900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0237, - "step": 4117000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0244, - "step": 4117100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 4117200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0256, - "step": 4117300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0253, - "step": 4117400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 4117500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 4117600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 4117700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 4117800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 4117900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 4118000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 4118100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 4118200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 4118300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 4118400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0257, - "step": 4118500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0241, - "step": 4118600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0247, - "step": 4118700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 4118800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.024, - "step": 4118900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 4119000 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0246, - "step": 4119100 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0241, - "step": 4119200 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0249, - "step": 4119300 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0254, - "step": 4119400 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0255, - "step": 4119500 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.025, - "step": 4119600 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0248, - "step": 4119700 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.024, - "step": 4119800 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0251, - "step": 4119900 - }, - { - "epoch": 0.0, - "learning_rate": 5e-07, - "loss": 0.0252, - "step": 4120000 - }, - { - "epoch": 0.0, - "eval_loss": 0.024749755859375, - "eval_runtime": 142.3614, - "eval_samples_per_second": 351.219, - "eval_steps_per_second": 21.951, - "step": 4120000 - } - ], - "logging_steps": 100, - "max_steps": 100000000, - "num_input_tokens_seen": 0, - "num_train_epochs": 9223372036854775807, - "save_steps": 20000, - "total_flos": 9.902118862348878e+18, - "train_batch_size": null, - "trial_name": null, - "trial_params": null -}