| { |
| "best_global_step": 565, |
| "best_metric": 0.1914851814508438, |
| "best_model_checkpoint": "saves_stability/prefix-tuning/llama-3-8b-instruct/train_cb_1757340166/checkpoint-565", |
| "epoch": 20.0, |
| "eval_steps": 113, |
| "global_step": 2260, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.04424778761061947, |
| "grad_norm": 189.6726837158203, |
| "learning_rate": 8.849557522123894e-07, |
| "loss": 7.3383, |
| "num_input_tokens_seen": 1248, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.08849557522123894, |
| "grad_norm": 185.91140747070312, |
| "learning_rate": 1.991150442477876e-06, |
| "loss": 6.8463, |
| "num_input_tokens_seen": 2864, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.13274336283185842, |
| "grad_norm": 136.40625, |
| "learning_rate": 3.097345132743363e-06, |
| "loss": 5.6631, |
| "num_input_tokens_seen": 4160, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.17699115044247787, |
| "grad_norm": 95.61026763916016, |
| "learning_rate": 4.2035398230088504e-06, |
| "loss": 3.7906, |
| "num_input_tokens_seen": 5552, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.22123893805309736, |
| "grad_norm": 83.75438690185547, |
| "learning_rate": 5.3097345132743365e-06, |
| "loss": 3.0376, |
| "num_input_tokens_seen": 6848, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.26548672566371684, |
| "grad_norm": 62.11867141723633, |
| "learning_rate": 6.415929203539823e-06, |
| "loss": 1.2539, |
| "num_input_tokens_seen": 8016, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.30973451327433627, |
| "grad_norm": 64.035400390625, |
| "learning_rate": 7.52212389380531e-06, |
| "loss": 1.0994, |
| "num_input_tokens_seen": 9408, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.35398230088495575, |
| "grad_norm": 51.03561019897461, |
| "learning_rate": 8.628318584070797e-06, |
| "loss": 0.4845, |
| "num_input_tokens_seen": 10736, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.39823008849557523, |
| "grad_norm": 113.1018295288086, |
| "learning_rate": 9.734513274336284e-06, |
| "loss": 0.5447, |
| "num_input_tokens_seen": 11984, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.4424778761061947, |
| "grad_norm": 15.160123825073242, |
| "learning_rate": 1.0840707964601771e-05, |
| "loss": 0.3824, |
| "num_input_tokens_seen": 13360, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.48672566371681414, |
| "grad_norm": 22.409584045410156, |
| "learning_rate": 1.1946902654867258e-05, |
| "loss": 0.5106, |
| "num_input_tokens_seen": 14800, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.5309734513274337, |
| "grad_norm": 54.432952880859375, |
| "learning_rate": 1.3053097345132745e-05, |
| "loss": 0.2694, |
| "num_input_tokens_seen": 16192, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.5752212389380531, |
| "grad_norm": 160.48135375976562, |
| "learning_rate": 1.415929203539823e-05, |
| "loss": 0.2188, |
| "num_input_tokens_seen": 18160, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.6194690265486725, |
| "grad_norm": 6.330208778381348, |
| "learning_rate": 1.5265486725663717e-05, |
| "loss": 0.1425, |
| "num_input_tokens_seen": 19520, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.6637168141592921, |
| "grad_norm": 49.4870491027832, |
| "learning_rate": 1.6371681415929206e-05, |
| "loss": 0.6564, |
| "num_input_tokens_seen": 20816, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.7079646017699115, |
| "grad_norm": 60.51303482055664, |
| "learning_rate": 1.747787610619469e-05, |
| "loss": 0.5263, |
| "num_input_tokens_seen": 22432, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.7522123893805309, |
| "grad_norm": 42.464073181152344, |
| "learning_rate": 1.858407079646018e-05, |
| "loss": 0.6093, |
| "num_input_tokens_seen": 23568, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.7964601769911505, |
| "grad_norm": 73.96981811523438, |
| "learning_rate": 1.9690265486725665e-05, |
| "loss": 0.5014, |
| "num_input_tokens_seen": 25280, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.8407079646017699, |
| "grad_norm": 45.10161590576172, |
| "learning_rate": 2.079646017699115e-05, |
| "loss": 0.5975, |
| "num_input_tokens_seen": 26656, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.8849557522123894, |
| "grad_norm": 25.65376091003418, |
| "learning_rate": 2.190265486725664e-05, |
| "loss": 0.3769, |
| "num_input_tokens_seen": 28080, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.9292035398230089, |
| "grad_norm": 15.245662689208984, |
| "learning_rate": 2.3008849557522124e-05, |
| "loss": 0.4009, |
| "num_input_tokens_seen": 29344, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.9734513274336283, |
| "grad_norm": 8.494036674499512, |
| "learning_rate": 2.411504424778761e-05, |
| "loss": 0.187, |
| "num_input_tokens_seen": 30576, |
| "step": 110 |
| }, |
| { |
| "epoch": 1.0, |
| "eval_loss": 0.578876256942749, |
| "eval_runtime": 0.7017, |
| "eval_samples_per_second": 35.628, |
| "eval_steps_per_second": 18.526, |
| "num_input_tokens_seen": 31088, |
| "step": 113 |
| }, |
| { |
| "epoch": 1.0176991150442478, |
| "grad_norm": 26.282270431518555, |
| "learning_rate": 2.5221238938053098e-05, |
| "loss": 0.2471, |
| "num_input_tokens_seen": 31584, |
| "step": 115 |
| }, |
| { |
| "epoch": 1.0619469026548674, |
| "grad_norm": 6.593599319458008, |
| "learning_rate": 2.6327433628318586e-05, |
| "loss": 0.4427, |
| "num_input_tokens_seen": 32688, |
| "step": 120 |
| }, |
| { |
| "epoch": 1.1061946902654867, |
| "grad_norm": 53.921142578125, |
| "learning_rate": 2.743362831858407e-05, |
| "loss": 0.458, |
| "num_input_tokens_seen": 33872, |
| "step": 125 |
| }, |
| { |
| "epoch": 1.1504424778761062, |
| "grad_norm": 6.744010925292969, |
| "learning_rate": 2.853982300884956e-05, |
| "loss": 0.1961, |
| "num_input_tokens_seen": 35136, |
| "step": 130 |
| }, |
| { |
| "epoch": 1.1946902654867257, |
| "grad_norm": 4.191463947296143, |
| "learning_rate": 2.964601769911505e-05, |
| "loss": 0.6081, |
| "num_input_tokens_seen": 36448, |
| "step": 135 |
| }, |
| { |
| "epoch": 1.238938053097345, |
| "grad_norm": 7.093989372253418, |
| "learning_rate": 3.075221238938053e-05, |
| "loss": 0.2536, |
| "num_input_tokens_seen": 37568, |
| "step": 140 |
| }, |
| { |
| "epoch": 1.2831858407079646, |
| "grad_norm": 25.461423873901367, |
| "learning_rate": 3.185840707964602e-05, |
| "loss": 0.3962, |
| "num_input_tokens_seen": 38800, |
| "step": 145 |
| }, |
| { |
| "epoch": 1.3274336283185841, |
| "grad_norm": 16.948087692260742, |
| "learning_rate": 3.296460176991151e-05, |
| "loss": 0.4931, |
| "num_input_tokens_seen": 39824, |
| "step": 150 |
| }, |
| { |
| "epoch": 1.3716814159292037, |
| "grad_norm": 1.3893619775772095, |
| "learning_rate": 3.407079646017699e-05, |
| "loss": 0.1558, |
| "num_input_tokens_seen": 41312, |
| "step": 155 |
| }, |
| { |
| "epoch": 1.415929203539823, |
| "grad_norm": 3.721031427383423, |
| "learning_rate": 3.517699115044248e-05, |
| "loss": 0.3323, |
| "num_input_tokens_seen": 42864, |
| "step": 160 |
| }, |
| { |
| "epoch": 1.4601769911504425, |
| "grad_norm": 4.4327392578125, |
| "learning_rate": 3.628318584070797e-05, |
| "loss": 0.1381, |
| "num_input_tokens_seen": 44480, |
| "step": 165 |
| }, |
| { |
| "epoch": 1.504424778761062, |
| "grad_norm": 7.111844539642334, |
| "learning_rate": 3.7389380530973455e-05, |
| "loss": 0.2045, |
| "num_input_tokens_seen": 45904, |
| "step": 170 |
| }, |
| { |
| "epoch": 1.5486725663716814, |
| "grad_norm": 5.4775238037109375, |
| "learning_rate": 3.849557522123894e-05, |
| "loss": 0.4024, |
| "num_input_tokens_seen": 47728, |
| "step": 175 |
| }, |
| { |
| "epoch": 1.592920353982301, |
| "grad_norm": 30.717697143554688, |
| "learning_rate": 3.9601769911504426e-05, |
| "loss": 0.3242, |
| "num_input_tokens_seen": 49072, |
| "step": 180 |
| }, |
| { |
| "epoch": 1.6371681415929205, |
| "grad_norm": 0.723737895488739, |
| "learning_rate": 4.0707964601769914e-05, |
| "loss": 0.364, |
| "num_input_tokens_seen": 50224, |
| "step": 185 |
| }, |
| { |
| "epoch": 1.6814159292035398, |
| "grad_norm": 13.76153564453125, |
| "learning_rate": 4.1814159292035396e-05, |
| "loss": 1.0525, |
| "num_input_tokens_seen": 51392, |
| "step": 190 |
| }, |
| { |
| "epoch": 1.7256637168141593, |
| "grad_norm": 7.214544296264648, |
| "learning_rate": 4.2920353982300885e-05, |
| "loss": 0.3414, |
| "num_input_tokens_seen": 53040, |
| "step": 195 |
| }, |
| { |
| "epoch": 1.7699115044247788, |
| "grad_norm": 3.2891359329223633, |
| "learning_rate": 4.4026548672566373e-05, |
| "loss": 0.368, |
| "num_input_tokens_seen": 54080, |
| "step": 200 |
| }, |
| { |
| "epoch": 1.8141592920353982, |
| "grad_norm": 2.3762855529785156, |
| "learning_rate": 4.5132743362831855e-05, |
| "loss": 0.1567, |
| "num_input_tokens_seen": 55456, |
| "step": 205 |
| }, |
| { |
| "epoch": 1.8584070796460177, |
| "grad_norm": 17.369836807250977, |
| "learning_rate": 4.6238938053097344e-05, |
| "loss": 0.4442, |
| "num_input_tokens_seen": 56928, |
| "step": 210 |
| }, |
| { |
| "epoch": 1.9026548672566372, |
| "grad_norm": 9.339529991149902, |
| "learning_rate": 4.734513274336283e-05, |
| "loss": 0.3297, |
| "num_input_tokens_seen": 58672, |
| "step": 215 |
| }, |
| { |
| "epoch": 1.9469026548672566, |
| "grad_norm": 11.139165878295898, |
| "learning_rate": 4.845132743362832e-05, |
| "loss": 0.3465, |
| "num_input_tokens_seen": 60064, |
| "step": 220 |
| }, |
| { |
| "epoch": 1.991150442477876, |
| "grad_norm": 7.265506744384766, |
| "learning_rate": 4.955752212389381e-05, |
| "loss": 0.4145, |
| "num_input_tokens_seen": 61760, |
| "step": 225 |
| }, |
| { |
| "epoch": 2.0, |
| "eval_loss": 0.2958730459213257, |
| "eval_runtime": 0.7037, |
| "eval_samples_per_second": 35.524, |
| "eval_steps_per_second": 18.472, |
| "num_input_tokens_seen": 61872, |
| "step": 226 |
| }, |
| { |
| "epoch": 2.0353982300884956, |
| "grad_norm": 3.572735548019409, |
| "learning_rate": 4.9999731620342936e-05, |
| "loss": 0.1649, |
| "num_input_tokens_seen": 63088, |
| "step": 230 |
| }, |
| { |
| "epoch": 2.079646017699115, |
| "grad_norm": 7.921909809112549, |
| "learning_rate": 4.9998091543305845e-05, |
| "loss": 0.2503, |
| "num_input_tokens_seen": 64400, |
| "step": 235 |
| }, |
| { |
| "epoch": 2.1238938053097347, |
| "grad_norm": 1.32038414478302, |
| "learning_rate": 4.999496058673635e-05, |
| "loss": 0.103, |
| "num_input_tokens_seen": 65904, |
| "step": 240 |
| }, |
| { |
| "epoch": 2.168141592920354, |
| "grad_norm": 32.722721099853516, |
| "learning_rate": 4.999033893736386e-05, |
| "loss": 0.1268, |
| "num_input_tokens_seen": 67184, |
| "step": 245 |
| }, |
| { |
| "epoch": 2.2123893805309733, |
| "grad_norm": 21.232004165649414, |
| "learning_rate": 4.99842268708223e-05, |
| "loss": 0.7923, |
| "num_input_tokens_seen": 68528, |
| "step": 250 |
| }, |
| { |
| "epoch": 2.256637168141593, |
| "grad_norm": 166.81187438964844, |
| "learning_rate": 4.9976624751633725e-05, |
| "loss": 0.5548, |
| "num_input_tokens_seen": 70016, |
| "step": 255 |
| }, |
| { |
| "epoch": 2.3008849557522124, |
| "grad_norm": 3.3175036907196045, |
| "learning_rate": 4.996753303318648e-05, |
| "loss": 0.4315, |
| "num_input_tokens_seen": 71280, |
| "step": 260 |
| }, |
| { |
| "epoch": 2.3451327433628317, |
| "grad_norm": 1.0060707330703735, |
| "learning_rate": 4.995695225770825e-05, |
| "loss": 0.2201, |
| "num_input_tokens_seen": 72960, |
| "step": 265 |
| }, |
| { |
| "epoch": 2.3893805309734515, |
| "grad_norm": 17.81039810180664, |
| "learning_rate": 4.994488305623365e-05, |
| "loss": 0.6303, |
| "num_input_tokens_seen": 74272, |
| "step": 270 |
| }, |
| { |
| "epoch": 2.433628318584071, |
| "grad_norm": 6.303297996520996, |
| "learning_rate": 4.993132614856666e-05, |
| "loss": 0.3667, |
| "num_input_tokens_seen": 75520, |
| "step": 275 |
| }, |
| { |
| "epoch": 2.47787610619469, |
| "grad_norm": 23.18317985534668, |
| "learning_rate": 4.991628234323765e-05, |
| "loss": 0.313, |
| "num_input_tokens_seen": 76672, |
| "step": 280 |
| }, |
| { |
| "epoch": 2.52212389380531, |
| "grad_norm": 20.775409698486328, |
| "learning_rate": 4.9899752537455166e-05, |
| "loss": 0.517, |
| "num_input_tokens_seen": 78272, |
| "step": 285 |
| }, |
| { |
| "epoch": 2.566371681415929, |
| "grad_norm": 4.6241135597229, |
| "learning_rate": 4.9881737717052436e-05, |
| "loss": 0.5348, |
| "num_input_tokens_seen": 79296, |
| "step": 290 |
| }, |
| { |
| "epoch": 2.6106194690265485, |
| "grad_norm": 0.8703950047492981, |
| "learning_rate": 4.9862238956428556e-05, |
| "loss": 0.3327, |
| "num_input_tokens_seen": 80704, |
| "step": 295 |
| }, |
| { |
| "epoch": 2.6548672566371683, |
| "grad_norm": 16.215003967285156, |
| "learning_rate": 4.984125741848441e-05, |
| "loss": 0.1693, |
| "num_input_tokens_seen": 82080, |
| "step": 300 |
| }, |
| { |
| "epoch": 2.6991150442477876, |
| "grad_norm": 0.3222169280052185, |
| "learning_rate": 4.981879435455336e-05, |
| "loss": 0.2452, |
| "num_input_tokens_seen": 83520, |
| "step": 305 |
| }, |
| { |
| "epoch": 2.7433628318584073, |
| "grad_norm": 18.398834228515625, |
| "learning_rate": 4.9794851104326554e-05, |
| "loss": 0.3755, |
| "num_input_tokens_seen": 84832, |
| "step": 310 |
| }, |
| { |
| "epoch": 2.7876106194690267, |
| "grad_norm": 0.6229631900787354, |
| "learning_rate": 4.976942909577307e-05, |
| "loss": 0.0989, |
| "num_input_tokens_seen": 86144, |
| "step": 315 |
| }, |
| { |
| "epoch": 2.831858407079646, |
| "grad_norm": 15.08667278289795, |
| "learning_rate": 4.974252984505475e-05, |
| "loss": 0.7358, |
| "num_input_tokens_seen": 87568, |
| "step": 320 |
| }, |
| { |
| "epoch": 2.8761061946902657, |
| "grad_norm": 0.5263973474502563, |
| "learning_rate": 4.971415495643574e-05, |
| "loss": 0.1467, |
| "num_input_tokens_seen": 89024, |
| "step": 325 |
| }, |
| { |
| "epoch": 2.920353982300885, |
| "grad_norm": 11.806825637817383, |
| "learning_rate": 4.968430612218687e-05, |
| "loss": 0.4301, |
| "num_input_tokens_seen": 90320, |
| "step": 330 |
| }, |
| { |
| "epoch": 2.9646017699115044, |
| "grad_norm": 2.6548068523406982, |
| "learning_rate": 4.965298512248466e-05, |
| "loss": 0.1423, |
| "num_input_tokens_seen": 91952, |
| "step": 335 |
| }, |
| { |
| "epoch": 3.0, |
| "eval_loss": 0.2716846764087677, |
| "eval_runtime": 0.7148, |
| "eval_samples_per_second": 34.974, |
| "eval_steps_per_second": 18.186, |
| "num_input_tokens_seen": 93016, |
| "step": 339 |
| }, |
| { |
| "epoch": 3.0088495575221237, |
| "grad_norm": 1.6614289283752441, |
| "learning_rate": 4.962019382530521e-05, |
| "loss": 0.4037, |
| "num_input_tokens_seen": 93496, |
| "step": 340 |
| }, |
| { |
| "epoch": 3.0530973451327434, |
| "grad_norm": 0.5826793313026428, |
| "learning_rate": 4.958593418631275e-05, |
| "loss": 0.0995, |
| "num_input_tokens_seen": 94984, |
| "step": 345 |
| }, |
| { |
| "epoch": 3.0973451327433628, |
| "grad_norm": 78.74436950683594, |
| "learning_rate": 4.955020824874307e-05, |
| "loss": 0.1109, |
| "num_input_tokens_seen": 96296, |
| "step": 350 |
| }, |
| { |
| "epoch": 3.1415929203539825, |
| "grad_norm": 0.03470359742641449, |
| "learning_rate": 4.951301814328157e-05, |
| "loss": 0.6523, |
| "num_input_tokens_seen": 97528, |
| "step": 355 |
| }, |
| { |
| "epoch": 3.185840707964602, |
| "grad_norm": 24.015844345092773, |
| "learning_rate": 4.947436608793624e-05, |
| "loss": 0.0699, |
| "num_input_tokens_seen": 98760, |
| "step": 360 |
| }, |
| { |
| "epoch": 3.230088495575221, |
| "grad_norm": 0.31200626492500305, |
| "learning_rate": 4.9434254387905395e-05, |
| "loss": 0.7896, |
| "num_input_tokens_seen": 99864, |
| "step": 365 |
| }, |
| { |
| "epoch": 3.274336283185841, |
| "grad_norm": 4.601434230804443, |
| "learning_rate": 4.9392685435440154e-05, |
| "loss": 0.3967, |
| "num_input_tokens_seen": 101208, |
| "step": 370 |
| }, |
| { |
| "epoch": 3.3185840707964602, |
| "grad_norm": 0.2778503894805908, |
| "learning_rate": 4.93496617097018e-05, |
| "loss": 0.2091, |
| "num_input_tokens_seen": 102520, |
| "step": 375 |
| }, |
| { |
| "epoch": 3.3628318584070795, |
| "grad_norm": 1.937961459159851, |
| "learning_rate": 4.930518577661388e-05, |
| "loss": 0.632, |
| "num_input_tokens_seen": 104024, |
| "step": 380 |
| }, |
| { |
| "epoch": 3.4070796460176993, |
| "grad_norm": 5.325284481048584, |
| "learning_rate": 4.925926028870923e-05, |
| "loss": 0.1242, |
| "num_input_tokens_seen": 105784, |
| "step": 385 |
| }, |
| { |
| "epoch": 3.4513274336283186, |
| "grad_norm": 1.0098425149917603, |
| "learning_rate": 4.921188798497173e-05, |
| "loss": 0.3439, |
| "num_input_tokens_seen": 107000, |
| "step": 390 |
| }, |
| { |
| "epoch": 3.495575221238938, |
| "grad_norm": 5.358858585357666, |
| "learning_rate": 4.9163071690672973e-05, |
| "loss": 0.3003, |
| "num_input_tokens_seen": 108536, |
| "step": 395 |
| }, |
| { |
| "epoch": 3.5398230088495577, |
| "grad_norm": 3.297380208969116, |
| "learning_rate": 4.911281431720378e-05, |
| "loss": 0.1494, |
| "num_input_tokens_seen": 110088, |
| "step": 400 |
| }, |
| { |
| "epoch": 3.584070796460177, |
| "grad_norm": 3.8313727378845215, |
| "learning_rate": 4.9061118861900537e-05, |
| "loss": 0.102, |
| "num_input_tokens_seen": 111752, |
| "step": 405 |
| }, |
| { |
| "epoch": 3.6283185840707963, |
| "grad_norm": 1.057591199874878, |
| "learning_rate": 4.900798840786645e-05, |
| "loss": 0.2434, |
| "num_input_tokens_seen": 112808, |
| "step": 410 |
| }, |
| { |
| "epoch": 3.672566371681416, |
| "grad_norm": 9.052189826965332, |
| "learning_rate": 4.8953426123787674e-05, |
| "loss": 0.2125, |
| "num_input_tokens_seen": 114264, |
| "step": 415 |
| }, |
| { |
| "epoch": 3.7168141592920354, |
| "grad_norm": 9.058222770690918, |
| "learning_rate": 4.889743526374432e-05, |
| "loss": 0.2125, |
| "num_input_tokens_seen": 115608, |
| "step": 420 |
| }, |
| { |
| "epoch": 3.7610619469026547, |
| "grad_norm": 19.98560333251953, |
| "learning_rate": 4.884001916701639e-05, |
| "loss": 0.4394, |
| "num_input_tokens_seen": 116888, |
| "step": 425 |
| }, |
| { |
| "epoch": 3.8053097345132745, |
| "grad_norm": 1.8458290100097656, |
| "learning_rate": 4.878118125788462e-05, |
| "loss": 0.2994, |
| "num_input_tokens_seen": 118648, |
| "step": 430 |
| }, |
| { |
| "epoch": 3.849557522123894, |
| "grad_norm": 4.5873637199401855, |
| "learning_rate": 4.872092504542629e-05, |
| "loss": 0.0786, |
| "num_input_tokens_seen": 119864, |
| "step": 435 |
| }, |
| { |
| "epoch": 3.893805309734513, |
| "grad_norm": 0.12213709205389023, |
| "learning_rate": 4.865925412330586e-05, |
| "loss": 0.2329, |
| "num_input_tokens_seen": 121144, |
| "step": 440 |
| }, |
| { |
| "epoch": 3.938053097345133, |
| "grad_norm": 2.7170772552490234, |
| "learning_rate": 4.859617216956074e-05, |
| "loss": 0.067, |
| "num_input_tokens_seen": 122504, |
| "step": 445 |
| }, |
| { |
| "epoch": 3.982300884955752, |
| "grad_norm": 6.894057273864746, |
| "learning_rate": 4.8531682946381874e-05, |
| "loss": 0.3879, |
| "num_input_tokens_seen": 123784, |
| "step": 450 |
| }, |
| { |
| "epoch": 4.0, |
| "eval_loss": 0.22076739370822906, |
| "eval_runtime": 0.7116, |
| "eval_samples_per_second": 35.13, |
| "eval_steps_per_second": 18.268, |
| "num_input_tokens_seen": 124056, |
| "step": 452 |
| }, |
| { |
| "epoch": 4.0265486725663715, |
| "grad_norm": 24.00897979736328, |
| "learning_rate": 4.846579029988939e-05, |
| "loss": 0.3232, |
| "num_input_tokens_seen": 124728, |
| "step": 455 |
| }, |
| { |
| "epoch": 4.070796460176991, |
| "grad_norm": 15.24329662322998, |
| "learning_rate": 4.8398498159903194e-05, |
| "loss": 0.1798, |
| "num_input_tokens_seen": 126264, |
| "step": 460 |
| }, |
| { |
| "epoch": 4.115044247787611, |
| "grad_norm": 7.804415702819824, |
| "learning_rate": 4.8329810539708625e-05, |
| "loss": 0.371, |
| "num_input_tokens_seen": 127400, |
| "step": 465 |
| }, |
| { |
| "epoch": 4.15929203539823, |
| "grad_norm": 2.5687830448150635, |
| "learning_rate": 4.825973153581709e-05, |
| "loss": 0.1414, |
| "num_input_tokens_seen": 128808, |
| "step": 470 |
| }, |
| { |
| "epoch": 4.20353982300885, |
| "grad_norm": 0.7442347407341003, |
| "learning_rate": 4.818826532772174e-05, |
| "loss": 0.0307, |
| "num_input_tokens_seen": 130184, |
| "step": 475 |
| }, |
| { |
| "epoch": 4.247787610619469, |
| "grad_norm": 8.681638717651367, |
| "learning_rate": 4.8115416177648234e-05, |
| "loss": 0.1952, |
| "num_input_tokens_seen": 131352, |
| "step": 480 |
| }, |
| { |
| "epoch": 4.292035398230088, |
| "grad_norm": 5.167050838470459, |
| "learning_rate": 4.804118843030049e-05, |
| "loss": 0.3033, |
| "num_input_tokens_seen": 132632, |
| "step": 485 |
| }, |
| { |
| "epoch": 4.336283185840708, |
| "grad_norm": 10.922381401062012, |
| "learning_rate": 4.796558651260165e-05, |
| "loss": 0.3456, |
| "num_input_tokens_seen": 133816, |
| "step": 490 |
| }, |
| { |
| "epoch": 4.380530973451328, |
| "grad_norm": 0.27616623044013977, |
| "learning_rate": 4.7888614933429955e-05, |
| "loss": 0.0062, |
| "num_input_tokens_seen": 135336, |
| "step": 495 |
| }, |
| { |
| "epoch": 4.424778761061947, |
| "grad_norm": 0.12749090790748596, |
| "learning_rate": 4.781027828334994e-05, |
| "loss": 0.4968, |
| "num_input_tokens_seen": 136504, |
| "step": 500 |
| }, |
| { |
| "epoch": 4.469026548672566, |
| "grad_norm": 0.6915990710258484, |
| "learning_rate": 4.773058123433857e-05, |
| "loss": 0.0192, |
| "num_input_tokens_seen": 137656, |
| "step": 505 |
| }, |
| { |
| "epoch": 4.513274336283186, |
| "grad_norm": 7.805994510650635, |
| "learning_rate": 4.7649528539506673e-05, |
| "loss": 0.4895, |
| "num_input_tokens_seen": 138776, |
| "step": 510 |
| }, |
| { |
| "epoch": 4.557522123893805, |
| "grad_norm": 2.6048264503479004, |
| "learning_rate": 4.7567125032815394e-05, |
| "loss": 0.222, |
| "num_input_tokens_seen": 140296, |
| "step": 515 |
| }, |
| { |
| "epoch": 4.601769911504425, |
| "grad_norm": 9.334881782531738, |
| "learning_rate": 4.7483375628787975e-05, |
| "loss": 0.0797, |
| "num_input_tokens_seen": 141720, |
| "step": 520 |
| }, |
| { |
| "epoch": 4.646017699115045, |
| "grad_norm": 0.3281296193599701, |
| "learning_rate": 4.739828532221661e-05, |
| "loss": 0.3392, |
| "num_input_tokens_seen": 143336, |
| "step": 525 |
| }, |
| { |
| "epoch": 4.6902654867256635, |
| "grad_norm": 5.273711681365967, |
| "learning_rate": 4.731185918786453e-05, |
| "loss": 0.0171, |
| "num_input_tokens_seen": 145016, |
| "step": 530 |
| }, |
| { |
| "epoch": 4.734513274336283, |
| "grad_norm": 1.681021809577942, |
| "learning_rate": 4.722410238016343e-05, |
| "loss": 0.2626, |
| "num_input_tokens_seen": 146296, |
| "step": 535 |
| }, |
| { |
| "epoch": 4.778761061946903, |
| "grad_norm": 7.981612205505371, |
| "learning_rate": 4.7135020132905985e-05, |
| "loss": 0.0767, |
| "num_input_tokens_seen": 147656, |
| "step": 540 |
| }, |
| { |
| "epoch": 4.823008849557522, |
| "grad_norm": 7.483859539031982, |
| "learning_rate": 4.7044617758933714e-05, |
| "loss": 0.3308, |
| "num_input_tokens_seen": 148808, |
| "step": 545 |
| }, |
| { |
| "epoch": 4.867256637168142, |
| "grad_norm": 0.7609708905220032, |
| "learning_rate": 4.695290064982018e-05, |
| "loss": 0.0535, |
| "num_input_tokens_seen": 150680, |
| "step": 550 |
| }, |
| { |
| "epoch": 4.911504424778761, |
| "grad_norm": 0.046275313943624496, |
| "learning_rate": 4.6859874275549376e-05, |
| "loss": 0.0784, |
| "num_input_tokens_seen": 152392, |
| "step": 555 |
| }, |
| { |
| "epoch": 4.95575221238938, |
| "grad_norm": 0.042602889239788055, |
| "learning_rate": 4.676554418418953e-05, |
| "loss": 0.2063, |
| "num_input_tokens_seen": 153896, |
| "step": 560 |
| }, |
| { |
| "epoch": 5.0, |
| "grad_norm": 14.422776222229004, |
| "learning_rate": 4.66699160015622e-05, |
| "loss": 0.3181, |
| "num_input_tokens_seen": 155240, |
| "step": 565 |
| }, |
| { |
| "epoch": 5.0, |
| "eval_loss": 0.1914851814508438, |
| "eval_runtime": 0.706, |
| "eval_samples_per_second": 35.408, |
| "eval_steps_per_second": 18.412, |
| "num_input_tokens_seen": 155240, |
| "step": 565 |
| }, |
| { |
| "epoch": 5.04424778761062, |
| "grad_norm": 7.451063632965088, |
| "learning_rate": 4.6572995430906784e-05, |
| "loss": 0.096, |
| "num_input_tokens_seen": 156296, |
| "step": 570 |
| }, |
| { |
| "epoch": 5.088495575221239, |
| "grad_norm": 0.47855445742607117, |
| "learning_rate": 4.6474788252540323e-05, |
| "loss": 0.0404, |
| "num_input_tokens_seen": 157688, |
| "step": 575 |
| }, |
| { |
| "epoch": 5.132743362831858, |
| "grad_norm": 2.1776716709136963, |
| "learning_rate": 4.637530032351284e-05, |
| "loss": 0.1061, |
| "num_input_tokens_seen": 159016, |
| "step": 580 |
| }, |
| { |
| "epoch": 5.176991150442478, |
| "grad_norm": 0.6025148630142212, |
| "learning_rate": 4.627453757725796e-05, |
| "loss": 0.0651, |
| "num_input_tokens_seen": 160552, |
| "step": 585 |
| }, |
| { |
| "epoch": 5.221238938053097, |
| "grad_norm": 4.6014885902404785, |
| "learning_rate": 4.617250602323907e-05, |
| "loss": 0.3572, |
| "num_input_tokens_seen": 161912, |
| "step": 590 |
| }, |
| { |
| "epoch": 5.265486725663717, |
| "grad_norm": 14.82335376739502, |
| "learning_rate": 4.6069211746590926e-05, |
| "loss": 0.2201, |
| "num_input_tokens_seen": 163288, |
| "step": 595 |
| }, |
| { |
| "epoch": 5.3097345132743365, |
| "grad_norm": 13.218178749084473, |
| "learning_rate": 4.596466090775672e-05, |
| "loss": 0.057, |
| "num_input_tokens_seen": 164920, |
| "step": 600 |
| }, |
| { |
| "epoch": 5.353982300884955, |
| "grad_norm": 0.04223296418786049, |
| "learning_rate": 4.585885974212068e-05, |
| "loss": 0.0209, |
| "num_input_tokens_seen": 166296, |
| "step": 605 |
| }, |
| { |
| "epoch": 5.398230088495575, |
| "grad_norm": 0.11065981537103653, |
| "learning_rate": 4.575181455963619e-05, |
| "loss": 0.0055, |
| "num_input_tokens_seen": 167720, |
| "step": 610 |
| }, |
| { |
| "epoch": 5.442477876106195, |
| "grad_norm": 0.5655098557472229, |
| "learning_rate": 4.5643531744449474e-05, |
| "loss": 0.1083, |
| "num_input_tokens_seen": 169480, |
| "step": 615 |
| }, |
| { |
| "epoch": 5.486725663716814, |
| "grad_norm": 0.04772956669330597, |
| "learning_rate": 4.553401775451882e-05, |
| "loss": 0.3601, |
| "num_input_tokens_seen": 171032, |
| "step": 620 |
| }, |
| { |
| "epoch": 5.530973451327434, |
| "grad_norm": 6.715683460235596, |
| "learning_rate": 4.542327912122949e-05, |
| "loss": 0.2244, |
| "num_input_tokens_seen": 172072, |
| "step": 625 |
| }, |
| { |
| "epoch": 5.575221238938053, |
| "grad_norm": 22.586627960205078, |
| "learning_rate": 4.531132244900411e-05, |
| "loss": 0.2021, |
| "num_input_tokens_seen": 173368, |
| "step": 630 |
| }, |
| { |
| "epoch": 5.619469026548672, |
| "grad_norm": 13.52296257019043, |
| "learning_rate": 4.519815441490884e-05, |
| "loss": 0.6313, |
| "num_input_tokens_seen": 174520, |
| "step": 635 |
| }, |
| { |
| "epoch": 5.663716814159292, |
| "grad_norm": 0.3611186444759369, |
| "learning_rate": 4.508378176825516e-05, |
| "loss": 0.1954, |
| "num_input_tokens_seen": 175976, |
| "step": 640 |
| }, |
| { |
| "epoch": 5.707964601769912, |
| "grad_norm": 0.21410831809043884, |
| "learning_rate": 4.496821133019728e-05, |
| "loss": 0.3492, |
| "num_input_tokens_seen": 177240, |
| "step": 645 |
| }, |
| { |
| "epoch": 5.752212389380531, |
| "grad_norm": 2.9298503398895264, |
| "learning_rate": 4.485144999332541e-05, |
| "loss": 0.2736, |
| "num_input_tokens_seen": 178808, |
| "step": 650 |
| }, |
| { |
| "epoch": 5.79646017699115, |
| "grad_norm": 0.7128796577453613, |
| "learning_rate": 4.4733504721254625e-05, |
| "loss": 0.2834, |
| "num_input_tokens_seen": 180024, |
| "step": 655 |
| }, |
| { |
| "epoch": 5.84070796460177, |
| "grad_norm": 1.0807809829711914, |
| "learning_rate": 4.461438254820959e-05, |
| "loss": 0.0907, |
| "num_input_tokens_seen": 181368, |
| "step": 660 |
| }, |
| { |
| "epoch": 5.88495575221239, |
| "grad_norm": 0.01651749014854431, |
| "learning_rate": 4.449409057860504e-05, |
| "loss": 0.0141, |
| "num_input_tokens_seen": 182696, |
| "step": 665 |
| }, |
| { |
| "epoch": 5.929203539823009, |
| "grad_norm": 0.07463294267654419, |
| "learning_rate": 4.4372635986622044e-05, |
| "loss": 0.0969, |
| "num_input_tokens_seen": 184232, |
| "step": 670 |
| }, |
| { |
| "epoch": 5.9734513274336285, |
| "grad_norm": 12.448905944824219, |
| "learning_rate": 4.425002601578017e-05, |
| "loss": 0.3964, |
| "num_input_tokens_seen": 185496, |
| "step": 675 |
| }, |
| { |
| "epoch": 6.0, |
| "eval_loss": 0.25048157572746277, |
| "eval_runtime": 0.7064, |
| "eval_samples_per_second": 35.392, |
| "eval_steps_per_second": 18.404, |
| "num_input_tokens_seen": 185984, |
| "step": 678 |
| }, |
| { |
| "epoch": 6.017699115044247, |
| "grad_norm": 0.009647250175476074, |
| "learning_rate": 4.4126267978505486e-05, |
| "loss": 0.0854, |
| "num_input_tokens_seen": 186464, |
| "step": 680 |
| }, |
| { |
| "epoch": 6.061946902654867, |
| "grad_norm": 0.5592358112335205, |
| "learning_rate": 4.4001369255694416e-05, |
| "loss": 0.0187, |
| "num_input_tokens_seen": 187936, |
| "step": 685 |
| }, |
| { |
| "epoch": 6.106194690265487, |
| "grad_norm": 0.055525992065668106, |
| "learning_rate": 4.387533729627359e-05, |
| "loss": 0.1246, |
| "num_input_tokens_seen": 189600, |
| "step": 690 |
| }, |
| { |
| "epoch": 6.150442477876107, |
| "grad_norm": 11.364505767822266, |
| "learning_rate": 4.374817961675553e-05, |
| "loss": 0.0787, |
| "num_input_tokens_seen": 190896, |
| "step": 695 |
| }, |
| { |
| "epoch": 6.1946902654867255, |
| "grad_norm": 13.535588264465332, |
| "learning_rate": 4.3619903800790465e-05, |
| "loss": 0.2709, |
| "num_input_tokens_seen": 192496, |
| "step": 700 |
| }, |
| { |
| "epoch": 6.238938053097345, |
| "grad_norm": 0.50110924243927, |
| "learning_rate": 4.3490517498713924e-05, |
| "loss": 0.0673, |
| "num_input_tokens_seen": 193968, |
| "step": 705 |
| }, |
| { |
| "epoch": 6.283185840707965, |
| "grad_norm": 1.2037434577941895, |
| "learning_rate": 4.336002842709057e-05, |
| "loss": 0.1423, |
| "num_input_tokens_seen": 195232, |
| "step": 710 |
| }, |
| { |
| "epoch": 6.327433628318584, |
| "grad_norm": 16.416555404663086, |
| "learning_rate": 4.3228444368253925e-05, |
| "loss": 0.0561, |
| "num_input_tokens_seen": 196880, |
| "step": 715 |
| }, |
| { |
| "epoch": 6.371681415929204, |
| "grad_norm": 0.02976650558412075, |
| "learning_rate": 4.309577316984228e-05, |
| "loss": 0.2113, |
| "num_input_tokens_seen": 198336, |
| "step": 720 |
| }, |
| { |
| "epoch": 6.415929203539823, |
| "grad_norm": 2.008106231689453, |
| "learning_rate": 4.2962022744330616e-05, |
| "loss": 0.0867, |
| "num_input_tokens_seen": 199632, |
| "step": 725 |
| }, |
| { |
| "epoch": 6.460176991150442, |
| "grad_norm": 12.198774337768555, |
| "learning_rate": 4.282720106855876e-05, |
| "loss": 0.1378, |
| "num_input_tokens_seen": 200944, |
| "step": 730 |
| }, |
| { |
| "epoch": 6.504424778761062, |
| "grad_norm": 2.4009339809417725, |
| "learning_rate": 4.269131618325559e-05, |
| "loss": 0.0768, |
| "num_input_tokens_seen": 202448, |
| "step": 735 |
| }, |
| { |
| "epoch": 6.548672566371682, |
| "grad_norm": 0.18601976335048676, |
| "learning_rate": 4.255437619255955e-05, |
| "loss": 0.0024, |
| "num_input_tokens_seen": 203952, |
| "step": 740 |
| }, |
| { |
| "epoch": 6.592920353982301, |
| "grad_norm": 0.03302191570401192, |
| "learning_rate": 4.241638926353526e-05, |
| "loss": 0.0076, |
| "num_input_tokens_seen": 205104, |
| "step": 745 |
| }, |
| { |
| "epoch": 6.6371681415929205, |
| "grad_norm": 3.894731283187866, |
| "learning_rate": 4.2277363625686475e-05, |
| "loss": 0.0192, |
| "num_input_tokens_seen": 206656, |
| "step": 750 |
| }, |
| { |
| "epoch": 6.68141592920354, |
| "grad_norm": 0.010169882327318192, |
| "learning_rate": 4.213730757046528e-05, |
| "loss": 0.2439, |
| "num_input_tokens_seen": 207648, |
| "step": 755 |
| }, |
| { |
| "epoch": 6.725663716814159, |
| "grad_norm": 0.009855731390416622, |
| "learning_rate": 4.199622945077755e-05, |
| "loss": 0.1697, |
| "num_input_tokens_seen": 208928, |
| "step": 760 |
| }, |
| { |
| "epoch": 6.769911504424779, |
| "grad_norm": 9.455737113952637, |
| "learning_rate": 4.185413768048483e-05, |
| "loss": 0.6775, |
| "num_input_tokens_seen": 210432, |
| "step": 765 |
| }, |
| { |
| "epoch": 6.814159292035399, |
| "grad_norm": 12.269086837768555, |
| "learning_rate": 4.1711040733902526e-05, |
| "loss": 0.2883, |
| "num_input_tokens_seen": 211680, |
| "step": 770 |
| }, |
| { |
| "epoch": 6.8584070796460175, |
| "grad_norm": 2.7802438735961914, |
| "learning_rate": 4.1566947145294474e-05, |
| "loss": 0.0973, |
| "num_input_tokens_seen": 212832, |
| "step": 775 |
| }, |
| { |
| "epoch": 6.902654867256637, |
| "grad_norm": 0.4753245413303375, |
| "learning_rate": 4.142186550836399e-05, |
| "loss": 0.0864, |
| "num_input_tokens_seen": 214144, |
| "step": 780 |
| }, |
| { |
| "epoch": 6.946902654867257, |
| "grad_norm": 11.31082820892334, |
| "learning_rate": 4.127580447574131e-05, |
| "loss": 0.1053, |
| "num_input_tokens_seen": 215440, |
| "step": 785 |
| }, |
| { |
| "epoch": 6.991150442477876, |
| "grad_norm": 0.08500145375728607, |
| "learning_rate": 4.1128772758467604e-05, |
| "loss": 0.0106, |
| "num_input_tokens_seen": 217088, |
| "step": 790 |
| }, |
| { |
| "epoch": 7.0, |
| "eval_loss": 0.30520543456077576, |
| "eval_runtime": 0.7215, |
| "eval_samples_per_second": 34.648, |
| "eval_steps_per_second": 18.017, |
| "num_input_tokens_seen": 217192, |
| "step": 791 |
| }, |
| { |
| "epoch": 7.035398230088496, |
| "grad_norm": 13.591657638549805, |
| "learning_rate": 4.098077912547536e-05, |
| "loss": 0.1356, |
| "num_input_tokens_seen": 218712, |
| "step": 795 |
| }, |
| { |
| "epoch": 7.079646017699115, |
| "grad_norm": 0.01622081734240055, |
| "learning_rate": 4.0831832403065526e-05, |
| "loss": 0.2267, |
| "num_input_tokens_seen": 219944, |
| "step": 800 |
| }, |
| { |
| "epoch": 7.123893805309734, |
| "grad_norm": 0.3760717213153839, |
| "learning_rate": 4.068194147438101e-05, |
| "loss": 0.0358, |
| "num_input_tokens_seen": 221304, |
| "step": 805 |
| }, |
| { |
| "epoch": 7.168141592920354, |
| "grad_norm": 1.4863288402557373, |
| "learning_rate": 4.0531115278876934e-05, |
| "loss": 0.0137, |
| "num_input_tokens_seen": 222680, |
| "step": 810 |
| }, |
| { |
| "epoch": 7.212389380530974, |
| "grad_norm": 1.766863226890564, |
| "learning_rate": 4.0379362811787504e-05, |
| "loss": 0.0083, |
| "num_input_tokens_seen": 224120, |
| "step": 815 |
| }, |
| { |
| "epoch": 7.256637168141593, |
| "grad_norm": 0.3708575367927551, |
| "learning_rate": 4.022669312358949e-05, |
| "loss": 0.0023, |
| "num_input_tokens_seen": 225448, |
| "step": 820 |
| }, |
| { |
| "epoch": 7.300884955752212, |
| "grad_norm": 19.3380069732666, |
| "learning_rate": 4.007311531946252e-05, |
| "loss": 0.0678, |
| "num_input_tokens_seen": 226840, |
| "step": 825 |
| }, |
| { |
| "epoch": 7.345132743362832, |
| "grad_norm": 0.014043072238564491, |
| "learning_rate": 3.9918638558745966e-05, |
| "loss": 0.0422, |
| "num_input_tokens_seen": 227928, |
| "step": 830 |
| }, |
| { |
| "epoch": 7.389380530973451, |
| "grad_norm": 0.017731424421072006, |
| "learning_rate": 3.976327205439279e-05, |
| "loss": 0.0062, |
| "num_input_tokens_seen": 229336, |
| "step": 835 |
| }, |
| { |
| "epoch": 7.433628318584071, |
| "grad_norm": 0.02818802371621132, |
| "learning_rate": 3.9607025072419986e-05, |
| "loss": 0.0008, |
| "num_input_tokens_seen": 231016, |
| "step": 840 |
| }, |
| { |
| "epoch": 7.477876106194691, |
| "grad_norm": 0.005128256976604462, |
| "learning_rate": 3.9449906931356005e-05, |
| "loss": 0.0018, |
| "num_input_tokens_seen": 232296, |
| "step": 845 |
| }, |
| { |
| "epoch": 7.522123893805309, |
| "grad_norm": 0.022384563460946083, |
| "learning_rate": 3.929192700168501e-05, |
| "loss": 0.0119, |
| "num_input_tokens_seen": 233432, |
| "step": 850 |
| }, |
| { |
| "epoch": 7.566371681415929, |
| "grad_norm": 0.005155643448233604, |
| "learning_rate": 3.9133094705287984e-05, |
| "loss": 0.0953, |
| "num_input_tokens_seen": 235224, |
| "step": 855 |
| }, |
| { |
| "epoch": 7.610619469026549, |
| "grad_norm": 19.11586570739746, |
| "learning_rate": 3.897341951488087e-05, |
| "loss": 0.1108, |
| "num_input_tokens_seen": 236456, |
| "step": 860 |
| }, |
| { |
| "epoch": 7.654867256637168, |
| "grad_norm": 0.005490736570209265, |
| "learning_rate": 3.8812910953449555e-05, |
| "loss": 0.0007, |
| "num_input_tokens_seen": 237912, |
| "step": 865 |
| }, |
| { |
| "epoch": 7.699115044247788, |
| "grad_norm": 0.006044618785381317, |
| "learning_rate": 3.865157859368196e-05, |
| "loss": 0.0024, |
| "num_input_tokens_seen": 239304, |
| "step": 870 |
| }, |
| { |
| "epoch": 7.743362831858407, |
| "grad_norm": 16.018875122070312, |
| "learning_rate": 3.848943205739711e-05, |
| "loss": 0.072, |
| "num_input_tokens_seen": 240616, |
| "step": 875 |
| }, |
| { |
| "epoch": 7.787610619469026, |
| "grad_norm": 0.09485733509063721, |
| "learning_rate": 3.832648101497134e-05, |
| "loss": 0.001, |
| "num_input_tokens_seen": 241928, |
| "step": 880 |
| }, |
| { |
| "epoch": 7.831858407079646, |
| "grad_norm": 0.004135196562856436, |
| "learning_rate": 3.8162735184761476e-05, |
| "loss": 0.0472, |
| "num_input_tokens_seen": 243432, |
| "step": 885 |
| }, |
| { |
| "epoch": 7.876106194690266, |
| "grad_norm": 0.3384363055229187, |
| "learning_rate": 3.799820433252529e-05, |
| "loss": 0.0019, |
| "num_input_tokens_seen": 244904, |
| "step": 890 |
| }, |
| { |
| "epoch": 7.920353982300885, |
| "grad_norm": 6.4393744468688965, |
| "learning_rate": 3.783289827083905e-05, |
| "loss": 0.0952, |
| "num_input_tokens_seen": 246152, |
| "step": 895 |
| }, |
| { |
| "epoch": 7.964601769911504, |
| "grad_norm": 0.11237289011478424, |
| "learning_rate": 3.766682685851234e-05, |
| "loss": 0.1278, |
| "num_input_tokens_seen": 247320, |
| "step": 900 |
| }, |
| { |
| "epoch": 8.0, |
| "eval_loss": 0.23238396644592285, |
| "eval_runtime": 0.7073, |
| "eval_samples_per_second": 35.344, |
| "eval_steps_per_second": 18.379, |
| "num_input_tokens_seen": 248456, |
| "step": 904 |
| }, |
| { |
| "epoch": 8.008849557522124, |
| "grad_norm": 0.0031759459525346756, |
| "learning_rate": 3.7500000000000003e-05, |
| "loss": 0.0022, |
| "num_input_tokens_seen": 248808, |
| "step": 905 |
| }, |
| { |
| "epoch": 8.053097345132743, |
| "grad_norm": 0.007015054579824209, |
| "learning_rate": 3.733242764481154e-05, |
| "loss": 0.0024, |
| "num_input_tokens_seen": 250056, |
| "step": 910 |
| }, |
| { |
| "epoch": 8.097345132743364, |
| "grad_norm": 0.002347296569496393, |
| "learning_rate": 3.716411978691766e-05, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 251560, |
| "step": 915 |
| }, |
| { |
| "epoch": 8.141592920353983, |
| "grad_norm": 0.04092458263039589, |
| "learning_rate": 3.699508646415424e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 252808, |
| "step": 920 |
| }, |
| { |
| "epoch": 8.185840707964601, |
| "grad_norm": 0.0027613777201622725, |
| "learning_rate": 3.6825337757623696e-05, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 254536, |
| "step": 925 |
| }, |
| { |
| "epoch": 8.230088495575222, |
| "grad_norm": 29.523372650146484, |
| "learning_rate": 3.665488379109377e-05, |
| "loss": 0.13, |
| "num_input_tokens_seen": 256136, |
| "step": 930 |
| }, |
| { |
| "epoch": 8.274336283185841, |
| "grad_norm": 0.0084888506680727, |
| "learning_rate": 3.648373473039368e-05, |
| "loss": 0.1807, |
| "num_input_tokens_seen": 257320, |
| "step": 935 |
| }, |
| { |
| "epoch": 8.31858407079646, |
| "grad_norm": 0.9264829754829407, |
| "learning_rate": 3.631190078280791e-05, |
| "loss": 0.0018, |
| "num_input_tokens_seen": 258920, |
| "step": 940 |
| }, |
| { |
| "epoch": 8.36283185840708, |
| "grad_norm": 0.0780593603849411, |
| "learning_rate": 3.613939219646739e-05, |
| "loss": 0.002, |
| "num_input_tokens_seen": 260216, |
| "step": 945 |
| }, |
| { |
| "epoch": 8.4070796460177, |
| "grad_norm": 0.010497448965907097, |
| "learning_rate": 3.596621925973835e-05, |
| "loss": 0.0014, |
| "num_input_tokens_seen": 261736, |
| "step": 950 |
| }, |
| { |
| "epoch": 8.451327433628318, |
| "grad_norm": 0.0034908235538750887, |
| "learning_rate": 3.579239230060867e-05, |
| "loss": 0.0003, |
| "num_input_tokens_seen": 262920, |
| "step": 955 |
| }, |
| { |
| "epoch": 8.495575221238939, |
| "grad_norm": 0.06873831152915955, |
| "learning_rate": 3.5617921686071995e-05, |
| "loss": 0.0004, |
| "num_input_tokens_seen": 264216, |
| "step": 960 |
| }, |
| { |
| "epoch": 8.539823008849558, |
| "grad_norm": 0.0016238873358815908, |
| "learning_rate": 3.544281782150936e-05, |
| "loss": 0.0009, |
| "num_input_tokens_seen": 265320, |
| "step": 965 |
| }, |
| { |
| "epoch": 8.584070796460177, |
| "grad_norm": 0.0033213901333510876, |
| "learning_rate": 3.526709115006871e-05, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 266776, |
| "step": 970 |
| }, |
| { |
| "epoch": 8.628318584070797, |
| "grad_norm": 0.14736580848693848, |
| "learning_rate": 3.5090752152041975e-05, |
| "loss": 0.0009, |
| "num_input_tokens_seen": 268296, |
| "step": 975 |
| }, |
| { |
| "epoch": 8.672566371681416, |
| "grad_norm": 0.004754791967570782, |
| "learning_rate": 3.491381134424012e-05, |
| "loss": 0.0269, |
| "num_input_tokens_seen": 269528, |
| "step": 980 |
| }, |
| { |
| "epoch": 8.716814159292035, |
| "grad_norm": 0.10114692151546478, |
| "learning_rate": 3.4736279279365876e-05, |
| "loss": 0.002, |
| "num_input_tokens_seen": 270808, |
| "step": 985 |
| }, |
| { |
| "epoch": 8.761061946902656, |
| "grad_norm": 0.014966151677072048, |
| "learning_rate": 3.455816654538438e-05, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 271992, |
| "step": 990 |
| }, |
| { |
| "epoch": 8.805309734513274, |
| "grad_norm": 0.005408014636486769, |
| "learning_rate": 3.437948376489172e-05, |
| "loss": 0.0015, |
| "num_input_tokens_seen": 273656, |
| "step": 995 |
| }, |
| { |
| "epoch": 8.849557522123893, |
| "grad_norm": 0.004298860672861338, |
| "learning_rate": 3.420024159448142e-05, |
| "loss": 0.0039, |
| "num_input_tokens_seen": 275352, |
| "step": 1000 |
| }, |
| { |
| "epoch": 8.893805309734514, |
| "grad_norm": 0.01132711861282587, |
| "learning_rate": 3.402045072410886e-05, |
| "loss": 0.0057, |
| "num_input_tokens_seen": 276472, |
| "step": 1005 |
| }, |
| { |
| "epoch": 8.938053097345133, |
| "grad_norm": 0.008406072854995728, |
| "learning_rate": 3.3840121876453734e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 277832, |
| "step": 1010 |
| }, |
| { |
| "epoch": 8.982300884955752, |
| "grad_norm": 0.904096245765686, |
| "learning_rate": 3.365926580628057e-05, |
| "loss": 0.1891, |
| "num_input_tokens_seen": 279352, |
| "step": 1015 |
| }, |
| { |
| "epoch": 9.0, |
| "eval_loss": 0.6020130515098572, |
| "eval_runtime": 0.7062, |
| "eval_samples_per_second": 35.399, |
| "eval_steps_per_second": 18.408, |
| "num_input_tokens_seen": 279744, |
| "step": 1017 |
| }, |
| { |
| "epoch": 9.026548672566372, |
| "grad_norm": 1.5356918573379517, |
| "learning_rate": 3.3477893299797304e-05, |
| "loss": 0.016, |
| "num_input_tokens_seen": 280608, |
| "step": 1020 |
| }, |
| { |
| "epoch": 9.070796460176991, |
| "grad_norm": 0.007247226778417826, |
| "learning_rate": 3.3296015174011984e-05, |
| "loss": 0.0003, |
| "num_input_tokens_seen": 281872, |
| "step": 1025 |
| }, |
| { |
| "epoch": 9.11504424778761, |
| "grad_norm": 0.005958537571132183, |
| "learning_rate": 3.311364227608768e-05, |
| "loss": 0.0004, |
| "num_input_tokens_seen": 283472, |
| "step": 1030 |
| }, |
| { |
| "epoch": 9.15929203539823, |
| "grad_norm": 0.010532599873840809, |
| "learning_rate": 3.293078548269553e-05, |
| "loss": 0.001, |
| "num_input_tokens_seen": 284992, |
| "step": 1035 |
| }, |
| { |
| "epoch": 9.20353982300885, |
| "grad_norm": 0.0058457087725400925, |
| "learning_rate": 3.2747455699366056e-05, |
| "loss": 0.006, |
| "num_input_tokens_seen": 286464, |
| "step": 1040 |
| }, |
| { |
| "epoch": 9.247787610619469, |
| "grad_norm": 1.6953773498535156, |
| "learning_rate": 3.256366385983879e-05, |
| "loss": 0.0018, |
| "num_input_tokens_seen": 287792, |
| "step": 1045 |
| }, |
| { |
| "epoch": 9.29203539823009, |
| "grad_norm": 0.03676780313253403, |
| "learning_rate": 3.237942092541018e-05, |
| "loss": 0.0005, |
| "num_input_tokens_seen": 288960, |
| "step": 1050 |
| }, |
| { |
| "epoch": 9.336283185840708, |
| "grad_norm": 0.44130468368530273, |
| "learning_rate": 3.219473788427984e-05, |
| "loss": 0.0005, |
| "num_input_tokens_seen": 290336, |
| "step": 1055 |
| }, |
| { |
| "epoch": 9.380530973451327, |
| "grad_norm": 0.006891911383718252, |
| "learning_rate": 3.2009625750895224e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 291728, |
| "step": 1060 |
| }, |
| { |
| "epoch": 9.424778761061948, |
| "grad_norm": 0.36952126026153564, |
| "learning_rate": 3.182409556529476e-05, |
| "loss": 0.0011, |
| "num_input_tokens_seen": 292944, |
| "step": 1065 |
| }, |
| { |
| "epoch": 9.469026548672566, |
| "grad_norm": 0.008110780268907547, |
| "learning_rate": 3.163815839244937e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 294448, |
| "step": 1070 |
| }, |
| { |
| "epoch": 9.513274336283185, |
| "grad_norm": 0.0020900957752019167, |
| "learning_rate": 3.14518253216026e-05, |
| "loss": 0.038, |
| "num_input_tokens_seen": 295744, |
| "step": 1075 |
| }, |
| { |
| "epoch": 9.557522123893806, |
| "grad_norm": 0.07697153836488724, |
| "learning_rate": 3.126510746560925e-05, |
| "loss": 0.0007, |
| "num_input_tokens_seen": 297280, |
| "step": 1080 |
| }, |
| { |
| "epoch": 9.601769911504425, |
| "grad_norm": 0.34611234068870544, |
| "learning_rate": 3.107801596027261e-05, |
| "loss": 0.0011, |
| "num_input_tokens_seen": 298624, |
| "step": 1085 |
| }, |
| { |
| "epoch": 9.646017699115044, |
| "grad_norm": 0.02981937676668167, |
| "learning_rate": 3.0890561963680306e-05, |
| "loss": 0.0008, |
| "num_input_tokens_seen": 299712, |
| "step": 1090 |
| }, |
| { |
| "epoch": 9.690265486725664, |
| "grad_norm": 0.006153137888759375, |
| "learning_rate": 3.0702756655538835e-05, |
| "loss": 0.0003, |
| "num_input_tokens_seen": 301152, |
| "step": 1095 |
| }, |
| { |
| "epoch": 9.734513274336283, |
| "grad_norm": 0.0012898497516289353, |
| "learning_rate": 3.051461123650685e-05, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 302768, |
| "step": 1100 |
| }, |
| { |
| "epoch": 9.778761061946902, |
| "grad_norm": 0.002362328115850687, |
| "learning_rate": 3.032613692752711e-05, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 304176, |
| "step": 1105 |
| }, |
| { |
| "epoch": 9.823008849557523, |
| "grad_norm": 0.0011592184891924262, |
| "learning_rate": 3.0137344969157284e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 305328, |
| "step": 1110 |
| }, |
| { |
| "epoch": 9.867256637168142, |
| "grad_norm": 0.0013819560408592224, |
| "learning_rate": 2.9948246620899557e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 306672, |
| "step": 1115 |
| }, |
| { |
| "epoch": 9.91150442477876, |
| "grad_norm": 0.006325939670205116, |
| "learning_rate": 2.9758853160529148e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 307920, |
| "step": 1120 |
| }, |
| { |
| "epoch": 9.955752212389381, |
| "grad_norm": 0.0011113256914541125, |
| "learning_rate": 2.9569175883421672e-05, |
| "loss": 0.0005, |
| "num_input_tokens_seen": 309424, |
| "step": 1125 |
| }, |
| { |
| "epoch": 10.0, |
| "grad_norm": 0.0014623363967984915, |
| "learning_rate": 2.93792261018795e-05, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 310888, |
| "step": 1130 |
| }, |
| { |
| "epoch": 10.0, |
| "eval_loss": 0.34932976961135864, |
| "eval_runtime": 0.7139, |
| "eval_samples_per_second": 35.017, |
| "eval_steps_per_second": 18.209, |
| "num_input_tokens_seen": 310888, |
| "step": 1130 |
| }, |
| { |
| "epoch": 10.044247787610619, |
| "grad_norm": 0.02761891670525074, |
| "learning_rate": 2.9189015144457087e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 312536, |
| "step": 1135 |
| }, |
| { |
| "epoch": 10.08849557522124, |
| "grad_norm": 0.0019943041261285543, |
| "learning_rate": 2.8998554355285355e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 314376, |
| "step": 1140 |
| }, |
| { |
| "epoch": 10.132743362831858, |
| "grad_norm": 0.014421291649341583, |
| "learning_rate": 2.8807855093395126e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 315752, |
| "step": 1145 |
| }, |
| { |
| "epoch": 10.176991150442477, |
| "grad_norm": 0.000781937618739903, |
| "learning_rate": 2.8616928732039684e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 317112, |
| "step": 1150 |
| }, |
| { |
| "epoch": 10.221238938053098, |
| "grad_norm": 0.006714432965964079, |
| "learning_rate": 2.8425786658016423e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 318376, |
| "step": 1155 |
| }, |
| { |
| "epoch": 10.265486725663717, |
| "grad_norm": 0.0016940284986048937, |
| "learning_rate": 2.8234440270987837e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 319624, |
| "step": 1160 |
| }, |
| { |
| "epoch": 10.309734513274336, |
| "grad_norm": 0.0008305677329190075, |
| "learning_rate": 2.804290098280155e-05, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 321176, |
| "step": 1165 |
| }, |
| { |
| "epoch": 10.353982300884956, |
| "grad_norm": 0.010752512142062187, |
| "learning_rate": 2.7851180216809796e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 322456, |
| "step": 1170 |
| }, |
| { |
| "epoch": 10.398230088495575, |
| "grad_norm": 0.05166340991854668, |
| "learning_rate": 2.765928940718806e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 323832, |
| "step": 1175 |
| }, |
| { |
| "epoch": 10.442477876106194, |
| "grad_norm": 0.0017710948595777154, |
| "learning_rate": 2.7467239998253214e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 325576, |
| "step": 1180 |
| }, |
| { |
| "epoch": 10.486725663716815, |
| "grad_norm": 0.0008418748620897532, |
| "learning_rate": 2.7275043443780934e-05, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 326888, |
| "step": 1185 |
| }, |
| { |
| "epoch": 10.530973451327434, |
| "grad_norm": 0.03435960412025452, |
| "learning_rate": 2.708271120632262e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 328296, |
| "step": 1190 |
| }, |
| { |
| "epoch": 10.575221238938052, |
| "grad_norm": 0.001704709604382515, |
| "learning_rate": 2.6890254756521778e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 329560, |
| "step": 1195 |
| }, |
| { |
| "epoch": 10.619469026548673, |
| "grad_norm": 0.0007327900966629386, |
| "learning_rate": 2.6697685572429886e-05, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 330696, |
| "step": 1200 |
| }, |
| { |
| "epoch": 10.663716814159292, |
| "grad_norm": 0.0013474611332640052, |
| "learning_rate": 2.65050151388219e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 332024, |
| "step": 1205 |
| }, |
| { |
| "epoch": 10.70796460176991, |
| "grad_norm": 0.0006612780271098018, |
| "learning_rate": 2.6312254946511217e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 333496, |
| "step": 1210 |
| }, |
| { |
| "epoch": 10.752212389380531, |
| "grad_norm": 0.008827758021652699, |
| "learning_rate": 2.6119416491664472e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 334680, |
| "step": 1215 |
| }, |
| { |
| "epoch": 10.79646017699115, |
| "grad_norm": 0.0013222714187577367, |
| "learning_rate": 2.5926511275115827e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 336264, |
| "step": 1220 |
| }, |
| { |
| "epoch": 10.84070796460177, |
| "grad_norm": 0.0009670580620877445, |
| "learning_rate": 2.57335508016811e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 337448, |
| "step": 1225 |
| }, |
| { |
| "epoch": 10.88495575221239, |
| "grad_norm": 0.00271513219922781, |
| "learning_rate": 2.5540546579471624e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 338664, |
| "step": 1230 |
| }, |
| { |
| "epoch": 10.929203539823009, |
| "grad_norm": 0.0006945921923033893, |
| "learning_rate": 2.5347510119207878e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 339976, |
| "step": 1235 |
| }, |
| { |
| "epoch": 10.973451327433628, |
| "grad_norm": 0.006609235890209675, |
| "learning_rate": 2.515445293353304e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 341048, |
| "step": 1240 |
| }, |
| { |
| "epoch": 11.0, |
| "eval_loss": 0.375334233045578, |
| "eval_runtime": 0.7083, |
| "eval_samples_per_second": 35.297, |
| "eval_steps_per_second": 18.354, |
| "num_input_tokens_seen": 341832, |
| "step": 1243 |
| }, |
| { |
| "epoch": 11.017699115044248, |
| "grad_norm": 0.002934755990281701, |
| "learning_rate": 2.4961386536326307e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 342296, |
| "step": 1245 |
| }, |
| { |
| "epoch": 11.061946902654867, |
| "grad_norm": 0.0018877610564231873, |
| "learning_rate": 2.4768322442016278e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 343592, |
| "step": 1250 |
| }, |
| { |
| "epoch": 11.106194690265486, |
| "grad_norm": 0.0224758367985487, |
| "learning_rate": 2.457527216489421e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 344648, |
| "step": 1255 |
| }, |
| { |
| "epoch": 11.150442477876107, |
| "grad_norm": 0.0008520939154550433, |
| "learning_rate": 2.438224721842728e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 345992, |
| "step": 1260 |
| }, |
| { |
| "epoch": 11.194690265486726, |
| "grad_norm": 0.0008513452485203743, |
| "learning_rate": 2.4189259114571984e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 347176, |
| "step": 1265 |
| }, |
| { |
| "epoch": 11.238938053097344, |
| "grad_norm": 0.0007206813897937536, |
| "learning_rate": 2.39963193630875e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 348552, |
| "step": 1270 |
| }, |
| { |
| "epoch": 11.283185840707965, |
| "grad_norm": 0.000876375415828079, |
| "learning_rate": 2.3803439470849335e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 349992, |
| "step": 1275 |
| }, |
| { |
| "epoch": 11.327433628318584, |
| "grad_norm": 0.0010858474997803569, |
| "learning_rate": 2.361063094116293e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 351272, |
| "step": 1280 |
| }, |
| { |
| "epoch": 11.371681415929203, |
| "grad_norm": 0.004272403661161661, |
| "learning_rate": 2.3417905273077756e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 352648, |
| "step": 1285 |
| }, |
| { |
| "epoch": 11.415929203539823, |
| "grad_norm": 0.0008781403303146362, |
| "learning_rate": 2.32252739607014e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 354200, |
| "step": 1290 |
| }, |
| { |
| "epoch": 11.460176991150442, |
| "grad_norm": 0.0074958656914532185, |
| "learning_rate": 2.3032748492514116e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 355544, |
| "step": 1295 |
| }, |
| { |
| "epoch": 11.504424778761061, |
| "grad_norm": 0.0006522294133901596, |
| "learning_rate": 2.2840340350683622e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 357032, |
| "step": 1300 |
| }, |
| { |
| "epoch": 11.548672566371682, |
| "grad_norm": 0.0007591209141537547, |
| "learning_rate": 2.2648061010380346e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 358216, |
| "step": 1305 |
| }, |
| { |
| "epoch": 11.5929203539823, |
| "grad_norm": 0.0011055746581405401, |
| "learning_rate": 2.2455921939093e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 359304, |
| "step": 1310 |
| }, |
| { |
| "epoch": 11.63716814159292, |
| "grad_norm": 0.0007098526693880558, |
| "learning_rate": 2.2263934595944716e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 360776, |
| "step": 1315 |
| }, |
| { |
| "epoch": 11.68141592920354, |
| "grad_norm": 0.0008268329547718167, |
| "learning_rate": 2.207211043100958e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 362152, |
| "step": 1320 |
| }, |
| { |
| "epoch": 11.725663716814159, |
| "grad_norm": 0.0014610046055167913, |
| "learning_rate": 2.188046088462979e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 363704, |
| "step": 1325 |
| }, |
| { |
| "epoch": 11.769911504424778, |
| "grad_norm": 0.0009060031734406948, |
| "learning_rate": 2.1688997386733316e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 365064, |
| "step": 1330 |
| }, |
| { |
| "epoch": 11.814159292035399, |
| "grad_norm": 0.0005430459277704358, |
| "learning_rate": 2.1497731356152286e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 366648, |
| "step": 1335 |
| }, |
| { |
| "epoch": 11.858407079646017, |
| "grad_norm": 0.0004894117009826005, |
| "learning_rate": 2.1306674199941872e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 368744, |
| "step": 1340 |
| }, |
| { |
| "epoch": 11.902654867256636, |
| "grad_norm": 0.0028311186470091343, |
| "learning_rate": 2.1115837312700088e-05, |
| "loss": 0.0003, |
| "num_input_tokens_seen": 370296, |
| "step": 1345 |
| }, |
| { |
| "epoch": 11.946902654867257, |
| "grad_norm": 0.002127131912857294, |
| "learning_rate": 2.0925232075888143e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 371608, |
| "step": 1350 |
| }, |
| { |
| "epoch": 11.991150442477876, |
| "grad_norm": 0.006925003137439489, |
| "learning_rate": 2.0734869857151666e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 372840, |
| "step": 1355 |
| }, |
| { |
| "epoch": 12.0, |
| "eval_loss": 0.3775991201400757, |
| "eval_runtime": 0.704, |
| "eval_samples_per_second": 35.514, |
| "eval_steps_per_second": 18.467, |
| "num_input_tokens_seen": 372952, |
| "step": 1356 |
| }, |
| { |
| "epoch": 12.035398230088495, |
| "grad_norm": 0.0007815115968696773, |
| "learning_rate": 2.054476200964278e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 374136, |
| "step": 1360 |
| }, |
| { |
| "epoch": 12.079646017699115, |
| "grad_norm": 0.0009184715454466641, |
| "learning_rate": 2.035491987134294e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 375528, |
| "step": 1365 |
| }, |
| { |
| "epoch": 12.123893805309734, |
| "grad_norm": 0.0004120321536902338, |
| "learning_rate": 2.0165354764386807e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 376952, |
| "step": 1370 |
| }, |
| { |
| "epoch": 12.168141592920353, |
| "grad_norm": 0.0027147261425852776, |
| "learning_rate": 1.997607799438694e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 378696, |
| "step": 1375 |
| }, |
| { |
| "epoch": 12.212389380530974, |
| "grad_norm": 0.013260483741760254, |
| "learning_rate": 1.978710084975959e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 379880, |
| "step": 1380 |
| }, |
| { |
| "epoch": 12.256637168141593, |
| "grad_norm": 0.0005999490385875106, |
| "learning_rate": 1.9598434601051386e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 380936, |
| "step": 1385 |
| }, |
| { |
| "epoch": 12.300884955752213, |
| "grad_norm": 0.0005612107925117016, |
| "learning_rate": 1.941009050026726e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 382200, |
| "step": 1390 |
| }, |
| { |
| "epoch": 12.345132743362832, |
| "grad_norm": 0.0006173701258376241, |
| "learning_rate": 1.922207978019928e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 383416, |
| "step": 1395 |
| }, |
| { |
| "epoch": 12.389380530973451, |
| "grad_norm": 0.0005518092657439411, |
| "learning_rate": 1.903441365375681e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 384952, |
| "step": 1400 |
| }, |
| { |
| "epoch": 12.43362831858407, |
| "grad_norm": 0.0005842981045134366, |
| "learning_rate": 1.884710331329772e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 386312, |
| "step": 1405 |
| }, |
| { |
| "epoch": 12.47787610619469, |
| "grad_norm": 0.0008977102697826922, |
| "learning_rate": 1.8660159929960914e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 387752, |
| "step": 1410 |
| }, |
| { |
| "epoch": 12.52212389380531, |
| "grad_norm": 0.0008995356620289385, |
| "learning_rate": 1.847359465300006e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 389480, |
| "step": 1415 |
| }, |
| { |
| "epoch": 12.56637168141593, |
| "grad_norm": 0.0004171531763859093, |
| "learning_rate": 1.828741860911867e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 390888, |
| "step": 1420 |
| }, |
| { |
| "epoch": 12.610619469026549, |
| "grad_norm": 0.0005489347968250513, |
| "learning_rate": 1.8101642901806486e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 392520, |
| "step": 1425 |
| }, |
| { |
| "epoch": 12.654867256637168, |
| "grad_norm": 0.00042922698776237667, |
| "learning_rate": 1.791627861067731e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 393848, |
| "step": 1430 |
| }, |
| { |
| "epoch": 12.699115044247787, |
| "grad_norm": 0.0006667824345640838, |
| "learning_rate": 1.7731336790808146e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 395032, |
| "step": 1435 |
| }, |
| { |
| "epoch": 12.743362831858407, |
| "grad_norm": 0.001416303333826363, |
| "learning_rate": 1.7546828472079992e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 396104, |
| "step": 1440 |
| }, |
| { |
| "epoch": 12.787610619469026, |
| "grad_norm": 0.003523145103827119, |
| "learning_rate": 1.7362764658519877e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 397432, |
| "step": 1445 |
| }, |
| { |
| "epoch": 12.831858407079647, |
| "grad_norm": 0.0027348888106644154, |
| "learning_rate": 1.7179156327644724e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 398920, |
| "step": 1450 |
| }, |
| { |
| "epoch": 12.876106194690266, |
| "grad_norm": 0.00041539667290635407, |
| "learning_rate": 1.699601442980655e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 400360, |
| "step": 1455 |
| }, |
| { |
| "epoch": 12.920353982300885, |
| "grad_norm": 0.00043466160423122346, |
| "learning_rate": 1.6813349887539443e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 401496, |
| "step": 1460 |
| }, |
| { |
| "epoch": 12.964601769911503, |
| "grad_norm": 0.010956169106066227, |
| "learning_rate": 1.663117359490814e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 402984, |
| "step": 1465 |
| }, |
| { |
| "epoch": 13.0, |
| "eval_loss": 0.38609811663627625, |
| "eval_runtime": 0.7052, |
| "eval_samples_per_second": 35.453, |
| "eval_steps_per_second": 18.435, |
| "num_input_tokens_seen": 403768, |
| "step": 1469 |
| }, |
| { |
| "epoch": 13.008849557522124, |
| "grad_norm": 0.000568145711440593, |
| "learning_rate": 1.6449496416858284e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 403976, |
| "step": 1470 |
| }, |
| { |
| "epoch": 13.053097345132743, |
| "grad_norm": 0.0006162641802802682, |
| "learning_rate": 1.6268329188568468e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 405320, |
| "step": 1475 |
| }, |
| { |
| "epoch": 13.097345132743364, |
| "grad_norm": 0.0005450973985716701, |
| "learning_rate": 1.6087682714804002e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 406664, |
| "step": 1480 |
| }, |
| { |
| "epoch": 13.141592920353983, |
| "grad_norm": 0.00032244238536804914, |
| "learning_rate": 1.5907567769272568e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 408456, |
| "step": 1485 |
| }, |
| { |
| "epoch": 13.185840707964601, |
| "grad_norm": 0.00024698488414287567, |
| "learning_rate": 1.5727995093981598e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 409752, |
| "step": 1490 |
| }, |
| { |
| "epoch": 13.230088495575222, |
| "grad_norm": 0.0013695581583306193, |
| "learning_rate": 1.5548975398597718e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 411048, |
| "step": 1495 |
| }, |
| { |
| "epoch": 13.274336283185841, |
| "grad_norm": 0.01129310205578804, |
| "learning_rate": 1.537051935980794e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 412504, |
| "step": 1500 |
| }, |
| { |
| "epoch": 13.31858407079646, |
| "grad_norm": 0.00111327541526407, |
| "learning_rate": 1.5192637620682981e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 413848, |
| "step": 1505 |
| }, |
| { |
| "epoch": 13.36283185840708, |
| "grad_norm": 0.0006722989492118359, |
| "learning_rate": 1.5015340790042446e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 415176, |
| "step": 1510 |
| }, |
| { |
| "epoch": 13.4070796460177, |
| "grad_norm": 0.0017885168781504035, |
| "learning_rate": 1.4838639441822183e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 416392, |
| "step": 1515 |
| }, |
| { |
| "epoch": 13.451327433628318, |
| "grad_norm": 0.0010682865977287292, |
| "learning_rate": 1.46625441144436e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 417848, |
| "step": 1520 |
| }, |
| { |
| "epoch": 13.495575221238939, |
| "grad_norm": 0.0004911267897114158, |
| "learning_rate": 1.4487065310185202e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 419288, |
| "step": 1525 |
| }, |
| { |
| "epoch": 13.539823008849558, |
| "grad_norm": 0.0005913931527175009, |
| "learning_rate": 1.4312213494556218e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 420504, |
| "step": 1530 |
| }, |
| { |
| "epoch": 13.584070796460177, |
| "grad_norm": 0.0009214850142598152, |
| "learning_rate": 1.4137999095672444e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 421656, |
| "step": 1535 |
| }, |
| { |
| "epoch": 13.628318584070797, |
| "grad_norm": 0.0005110717611387372, |
| "learning_rate": 1.3964432503634281e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 422792, |
| "step": 1540 |
| }, |
| { |
| "epoch": 13.672566371681416, |
| "grad_norm": 0.004927432630211115, |
| "learning_rate": 1.3791524069907141e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 424168, |
| "step": 1545 |
| }, |
| { |
| "epoch": 13.716814159292035, |
| "grad_norm": 0.0004478210466913879, |
| "learning_rate": 1.361928410670403e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 425400, |
| "step": 1550 |
| }, |
| { |
| "epoch": 13.761061946902656, |
| "grad_norm": 0.0008229210507124662, |
| "learning_rate": 1.3447722886370565e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 426984, |
| "step": 1555 |
| }, |
| { |
| "epoch": 13.805309734513274, |
| "grad_norm": 0.00203742366284132, |
| "learning_rate": 1.3276850640772288e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 428296, |
| "step": 1560 |
| }, |
| { |
| "epoch": 13.849557522123893, |
| "grad_norm": 0.00038946408312767744, |
| "learning_rate": 1.3106677560684494e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 429752, |
| "step": 1565 |
| }, |
| { |
| "epoch": 13.893805309734514, |
| "grad_norm": 0.0007134500774554908, |
| "learning_rate": 1.2937213795184434e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 431208, |
| "step": 1570 |
| }, |
| { |
| "epoch": 13.938053097345133, |
| "grad_norm": 0.0004365367931313813, |
| "learning_rate": 1.2768469451046029e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 432792, |
| "step": 1575 |
| }, |
| { |
| "epoch": 13.982300884955752, |
| "grad_norm": 0.00040046818321570754, |
| "learning_rate": 1.2600454592137062e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 434408, |
| "step": 1580 |
| }, |
| { |
| "epoch": 14.0, |
| "eval_loss": 0.39140841364860535, |
| "eval_runtime": 0.7065, |
| "eval_samples_per_second": 35.383, |
| "eval_steps_per_second": 18.399, |
| "num_input_tokens_seen": 434704, |
| "step": 1582 |
| }, |
| { |
| "epoch": 14.026548672566372, |
| "grad_norm": 0.0027648855466395617, |
| "learning_rate": 1.2433179238819077e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 435488, |
| "step": 1585 |
| }, |
| { |
| "epoch": 14.070796460176991, |
| "grad_norm": 0.00045209741801954806, |
| "learning_rate": 1.2266653367349657e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 436816, |
| "step": 1590 |
| }, |
| { |
| "epoch": 14.11504424778761, |
| "grad_norm": 0.0004914466990157962, |
| "learning_rate": 1.2100886909287478e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 438352, |
| "step": 1595 |
| }, |
| { |
| "epoch": 14.15929203539823, |
| "grad_norm": 0.001412543118931353, |
| "learning_rate": 1.1935889750900034e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 439520, |
| "step": 1600 |
| }, |
| { |
| "epoch": 14.20353982300885, |
| "grad_norm": 0.0009035487892106175, |
| "learning_rate": 1.1771671732573976e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 440928, |
| "step": 1605 |
| }, |
| { |
| "epoch": 14.247787610619469, |
| "grad_norm": 0.000611193710938096, |
| "learning_rate": 1.1608242648228257e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 442320, |
| "step": 1610 |
| }, |
| { |
| "epoch": 14.29203539823009, |
| "grad_norm": 0.010955613106489182, |
| "learning_rate": 1.1445612244729984e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 443888, |
| "step": 1615 |
| }, |
| { |
| "epoch": 14.336283185840708, |
| "grad_norm": 0.0007344707264564931, |
| "learning_rate": 1.1283790221313208e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 445264, |
| "step": 1620 |
| }, |
| { |
| "epoch": 14.380530973451327, |
| "grad_norm": 0.0013769727665930986, |
| "learning_rate": 1.1122786229000356e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 446800, |
| "step": 1625 |
| }, |
| { |
| "epoch": 14.424778761061948, |
| "grad_norm": 0.0010539049981161952, |
| "learning_rate": 1.0962609870026724e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 448256, |
| "step": 1630 |
| }, |
| { |
| "epoch": 14.469026548672566, |
| "grad_norm": 0.001179005135782063, |
| "learning_rate": 1.0803270697267764e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 449904, |
| "step": 1635 |
| }, |
| { |
| "epoch": 14.513274336283185, |
| "grad_norm": 0.0009212405420839787, |
| "learning_rate": 1.0644778213669385e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 451216, |
| "step": 1640 |
| }, |
| { |
| "epoch": 14.557522123893806, |
| "grad_norm": 0.0004893370205536485, |
| "learning_rate": 1.0487141871681142e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 452800, |
| "step": 1645 |
| }, |
| { |
| "epoch": 14.601769911504425, |
| "grad_norm": 0.00043644453398883343, |
| "learning_rate": 1.0330371072692565e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 454208, |
| "step": 1650 |
| }, |
| { |
| "epoch": 14.646017699115044, |
| "grad_norm": 0.012024251744151115, |
| "learning_rate": 1.0174475166472417e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 455312, |
| "step": 1655 |
| }, |
| { |
| "epoch": 14.690265486725664, |
| "grad_norm": 0.0005725009250454605, |
| "learning_rate": 1.0019463450611103e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 456576, |
| "step": 1660 |
| }, |
| { |
| "epoch": 14.734513274336283, |
| "grad_norm": 0.0003146920935250819, |
| "learning_rate": 9.865345169966114e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 457808, |
| "step": 1665 |
| }, |
| { |
| "epoch": 14.778761061946902, |
| "grad_norm": 0.0005140115972608328, |
| "learning_rate": 9.71212951611074e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 459296, |
| "step": 1670 |
| }, |
| { |
| "epoch": 14.823008849557523, |
| "grad_norm": 0.0004598709929268807, |
| "learning_rate": 9.559825626785837e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 460784, |
| "step": 1675 |
| }, |
| { |
| "epoch": 14.867256637168142, |
| "grad_norm": 0.0003896567795891315, |
| "learning_rate": 9.40844258535487e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 462304, |
| "step": 1680 |
| }, |
| { |
| "epoch": 14.91150442477876, |
| "grad_norm": 0.003628954291343689, |
| "learning_rate": 9.257989420262151e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 463456, |
| "step": 1685 |
| }, |
| { |
| "epoch": 14.955752212389381, |
| "grad_norm": 0.00039532355731353164, |
| "learning_rate": 9.108475104494475e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 464672, |
| "step": 1690 |
| }, |
| { |
| "epoch": 15.0, |
| "grad_norm": 0.000577093567699194, |
| "learning_rate": 8.959908555045846e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 466016, |
| "step": 1695 |
| }, |
| { |
| "epoch": 15.0, |
| "eval_loss": 0.39010700583457947, |
| "eval_runtime": 0.7102, |
| "eval_samples_per_second": 35.202, |
| "eval_steps_per_second": 18.305, |
| "num_input_tokens_seen": 466016, |
| "step": 1695 |
| }, |
| { |
| "epoch": 15.044247787610619, |
| "grad_norm": 0.0005454930360428989, |
| "learning_rate": 8.812298632385784e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 467328, |
| "step": 1700 |
| }, |
| { |
| "epoch": 15.08849557522124, |
| "grad_norm": 0.00047806225484237075, |
| "learning_rate": 8.66565413993082e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 468624, |
| "step": 1705 |
| }, |
| { |
| "epoch": 15.132743362831858, |
| "grad_norm": 0.00037900806637480855, |
| "learning_rate": 8.519983823519496e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 470400, |
| "step": 1710 |
| }, |
| { |
| "epoch": 15.176991150442477, |
| "grad_norm": 0.0004916319157928228, |
| "learning_rate": 8.375296370890749e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 471568, |
| "step": 1715 |
| }, |
| { |
| "epoch": 15.221238938053098, |
| "grad_norm": 0.00044859075569547713, |
| "learning_rate": 8.231600411165757e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 473376, |
| "step": 1720 |
| }, |
| { |
| "epoch": 15.265486725663717, |
| "grad_norm": 0.0003034193068742752, |
| "learning_rate": 8.088904514333384e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 474752, |
| "step": 1725 |
| }, |
| { |
| "epoch": 15.309734513274336, |
| "grad_norm": 0.0005404249532148242, |
| "learning_rate": 7.947217190738945e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 476224, |
| "step": 1730 |
| }, |
| { |
| "epoch": 15.353982300884956, |
| "grad_norm": 0.0011344121303409338, |
| "learning_rate": 7.806546890576753e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 477504, |
| "step": 1735 |
| }, |
| { |
| "epoch": 15.398230088495575, |
| "grad_norm": 0.0005183634348213673, |
| "learning_rate": 7.666902003386104e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 479312, |
| "step": 1740 |
| }, |
| { |
| "epoch": 15.442477876106194, |
| "grad_norm": 0.0027479766868054867, |
| "learning_rate": 7.528290857550943e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 480672, |
| "step": 1745 |
| }, |
| { |
| "epoch": 15.486725663716815, |
| "grad_norm": 0.00043079385068267584, |
| "learning_rate": 7.390721719803137e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 481840, |
| "step": 1750 |
| }, |
| { |
| "epoch": 15.530973451327434, |
| "grad_norm": 0.0007008668035268784, |
| "learning_rate": 7.254202794729484e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 483040, |
| "step": 1755 |
| }, |
| { |
| "epoch": 15.575221238938052, |
| "grad_norm": 0.0003630095161497593, |
| "learning_rate": 7.11874222428238e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 484384, |
| "step": 1760 |
| }, |
| { |
| "epoch": 15.619469026548673, |
| "grad_norm": 0.0003840439021587372, |
| "learning_rate": 6.9843480872942294e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 485648, |
| "step": 1765 |
| }, |
| { |
| "epoch": 15.663716814159292, |
| "grad_norm": 0.000495024723932147, |
| "learning_rate": 6.851028398995607e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 486832, |
| "step": 1770 |
| }, |
| { |
| "epoch": 15.70796460176991, |
| "grad_norm": 0.0003455649421084672, |
| "learning_rate": 6.718791110537287e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 488240, |
| "step": 1775 |
| }, |
| { |
| "epoch": 15.752212389380531, |
| "grad_norm": 0.0005622297758236527, |
| "learning_rate": 6.587644108515986e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 489600, |
| "step": 1780 |
| }, |
| { |
| "epoch": 15.79646017699115, |
| "grad_norm": 0.003728945506736636, |
| "learning_rate": 6.457595214504042e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 491344, |
| "step": 1785 |
| }, |
| { |
| "epoch": 15.84070796460177, |
| "grad_norm": 0.0030601960606873035, |
| "learning_rate": 6.328652184582884e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 492656, |
| "step": 1790 |
| }, |
| { |
| "epoch": 15.88495575221239, |
| "grad_norm": 0.0007764626061543822, |
| "learning_rate": 6.200822708880563e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 493968, |
| "step": 1795 |
| }, |
| { |
| "epoch": 15.929203539823009, |
| "grad_norm": 0.0006884140893816948, |
| "learning_rate": 6.074114411112997e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 495216, |
| "step": 1800 |
| }, |
| { |
| "epoch": 15.973451327433628, |
| "grad_norm": 0.00040875590639188886, |
| "learning_rate": 5.948534848129378e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 496592, |
| "step": 1805 |
| }, |
| { |
| "epoch": 16.0, |
| "eval_loss": 0.39420193433761597, |
| "eval_runtime": 0.704, |
| "eval_samples_per_second": 35.514, |
| "eval_steps_per_second": 18.467, |
| "num_input_tokens_seen": 497200, |
| "step": 1808 |
| }, |
| { |
| "epoch": 16.01769911504425, |
| "grad_norm": 0.0015643464867025614, |
| "learning_rate": 5.824091509461449e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 497632, |
| "step": 1810 |
| }, |
| { |
| "epoch": 16.061946902654867, |
| "grad_norm": 0.0011178962886333466, |
| "learning_rate": 5.7007918168768405e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 499392, |
| "step": 1815 |
| }, |
| { |
| "epoch": 16.106194690265486, |
| "grad_norm": 0.0008782587246969342, |
| "learning_rate": 5.5786431239364365e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 500736, |
| "step": 1820 |
| }, |
| { |
| "epoch": 16.150442477876105, |
| "grad_norm": 0.00287413178011775, |
| "learning_rate": 5.457652715555781e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 501840, |
| "step": 1825 |
| }, |
| { |
| "epoch": 16.194690265486727, |
| "grad_norm": 0.0021151225082576275, |
| "learning_rate": 5.337827807570689e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 503056, |
| "step": 1830 |
| }, |
| { |
| "epoch": 16.238938053097346, |
| "grad_norm": 0.01046368945389986, |
| "learning_rate": 5.219175546306784e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 504752, |
| "step": 1835 |
| }, |
| { |
| "epoch": 16.283185840707965, |
| "grad_norm": 0.0012730384478345513, |
| "learning_rate": 5.1017030081533914e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 505840, |
| "step": 1840 |
| }, |
| { |
| "epoch": 16.327433628318584, |
| "grad_norm": 0.0024244447704404593, |
| "learning_rate": 4.985417199141443e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 507552, |
| "step": 1845 |
| }, |
| { |
| "epoch": 16.371681415929203, |
| "grad_norm": 0.0004395665309857577, |
| "learning_rate": 4.870325054525673e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 509040, |
| "step": 1850 |
| }, |
| { |
| "epoch": 16.41592920353982, |
| "grad_norm": 0.0088771628215909, |
| "learning_rate": 4.7564334383709745e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 510368, |
| "step": 1855 |
| }, |
| { |
| "epoch": 16.460176991150444, |
| "grad_norm": 0.004274384584277868, |
| "learning_rate": 4.6437491431430556e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 511744, |
| "step": 1860 |
| }, |
| { |
| "epoch": 16.504424778761063, |
| "grad_norm": 0.0006105461507104337, |
| "learning_rate": 4.5322788893033155e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 512992, |
| "step": 1865 |
| }, |
| { |
| "epoch": 16.548672566371682, |
| "grad_norm": 0.000467473961180076, |
| "learning_rate": 4.422029324908061e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 514880, |
| "step": 1870 |
| }, |
| { |
| "epoch": 16.5929203539823, |
| "grad_norm": 0.00047930245636962354, |
| "learning_rate": 4.313007025211985e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 516128, |
| "step": 1875 |
| }, |
| { |
| "epoch": 16.63716814159292, |
| "grad_norm": 0.004217915236949921, |
| "learning_rate": 4.205218492276055e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 517344, |
| "step": 1880 |
| }, |
| { |
| "epoch": 16.68141592920354, |
| "grad_norm": 0.0005477258819155395, |
| "learning_rate": 4.098670154579715e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 518528, |
| "step": 1885 |
| }, |
| { |
| "epoch": 16.72566371681416, |
| "grad_norm": 0.0005239736055955291, |
| "learning_rate": 3.9933683666374986e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 520272, |
| "step": 1890 |
| }, |
| { |
| "epoch": 16.76991150442478, |
| "grad_norm": 0.0002867616422008723, |
| "learning_rate": 3.889319408620021e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 521664, |
| "step": 1895 |
| }, |
| { |
| "epoch": 16.8141592920354, |
| "grad_norm": 0.0004365715431049466, |
| "learning_rate": 3.7865294859794926e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 522960, |
| "step": 1900 |
| }, |
| { |
| "epoch": 16.858407079646017, |
| "grad_norm": 0.0006399775156751275, |
| "learning_rate": 3.68500472907955e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 524288, |
| "step": 1905 |
| }, |
| { |
| "epoch": 16.902654867256636, |
| "grad_norm": 0.005013927351683378, |
| "learning_rate": 3.584751192829705e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 525680, |
| "step": 1910 |
| }, |
| { |
| "epoch": 16.946902654867255, |
| "grad_norm": 0.0012623013462871313, |
| "learning_rate": 3.4857748563242006e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 527072, |
| "step": 1915 |
| }, |
| { |
| "epoch": 16.991150442477878, |
| "grad_norm": 0.007140376605093479, |
| "learning_rate": 3.388081622485431e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 528176, |
| "step": 1920 |
| }, |
| { |
| "epoch": 17.0, |
| "eval_loss": 0.3934271037578583, |
| "eval_runtime": 0.7122, |
| "eval_samples_per_second": 35.103, |
| "eval_steps_per_second": 18.253, |
| "num_input_tokens_seen": 528320, |
| "step": 1921 |
| }, |
| { |
| "epoch": 17.035398230088497, |
| "grad_norm": 0.0004315488622523844, |
| "learning_rate": 3.2916773177118778e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 529472, |
| "step": 1925 |
| }, |
| { |
| "epoch": 17.079646017699115, |
| "grad_norm": 0.0008599446737207472, |
| "learning_rate": 3.1965676915306384e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 530592, |
| "step": 1930 |
| }, |
| { |
| "epoch": 17.123893805309734, |
| "grad_norm": 0.00048269209219142795, |
| "learning_rate": 3.102758416254545e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 531712, |
| "step": 1935 |
| }, |
| { |
| "epoch": 17.168141592920353, |
| "grad_norm": 0.00044269204954616725, |
| "learning_rate": 3.010255086643818e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 533136, |
| "step": 1940 |
| }, |
| { |
| "epoch": 17.212389380530972, |
| "grad_norm": 0.0002626892237458378, |
| "learning_rate": 2.919063219572438e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 534368, |
| "step": 1945 |
| }, |
| { |
| "epoch": 17.256637168141594, |
| "grad_norm": 0.0005254297866486013, |
| "learning_rate": 2.829188253699111e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 536016, |
| "step": 1950 |
| }, |
| { |
| "epoch": 17.300884955752213, |
| "grad_norm": 0.00026018652715720236, |
| "learning_rate": 2.7406355491429086e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 537376, |
| "step": 1955 |
| }, |
| { |
| "epoch": 17.345132743362832, |
| "grad_norm": 0.0006533955456689, |
| "learning_rate": 2.653410387163574e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 538976, |
| "step": 1960 |
| }, |
| { |
| "epoch": 17.38938053097345, |
| "grad_norm": 0.00046180252684280276, |
| "learning_rate": 2.567517969846575e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 540448, |
| "step": 1965 |
| }, |
| { |
| "epoch": 17.43362831858407, |
| "grad_norm": 0.0002887235314119607, |
| "learning_rate": 2.482963419792844e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 541744, |
| "step": 1970 |
| }, |
| { |
| "epoch": 17.47787610619469, |
| "grad_norm": 0.0005816632765345275, |
| "learning_rate": 2.399751779813264e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 542928, |
| "step": 1975 |
| }, |
| { |
| "epoch": 17.52212389380531, |
| "grad_norm": 0.005728741642087698, |
| "learning_rate": 2.317888012627914e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 544208, |
| "step": 1980 |
| }, |
| { |
| "epoch": 17.56637168141593, |
| "grad_norm": 0.0010766932973638177, |
| "learning_rate": 2.2373770005700955e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 545312, |
| "step": 1985 |
| }, |
| { |
| "epoch": 17.61061946902655, |
| "grad_norm": 0.0014621912268921733, |
| "learning_rate": 2.1582235452951682e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 546816, |
| "step": 1990 |
| }, |
| { |
| "epoch": 17.654867256637168, |
| "grad_norm": 0.0018646982498466969, |
| "learning_rate": 2.0804323674941563e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 547888, |
| "step": 1995 |
| }, |
| { |
| "epoch": 17.699115044247787, |
| "grad_norm": 0.0007122437236830592, |
| "learning_rate": 2.0040081066122043e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 549360, |
| "step": 2000 |
| }, |
| { |
| "epoch": 17.743362831858406, |
| "grad_norm": 0.00036821604589931667, |
| "learning_rate": 1.9289553205719317e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 550688, |
| "step": 2005 |
| }, |
| { |
| "epoch": 17.787610619469028, |
| "grad_norm": 0.005967443808913231, |
| "learning_rate": 1.8552784855015215e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 552144, |
| "step": 2010 |
| }, |
| { |
| "epoch": 17.831858407079647, |
| "grad_norm": 0.0005748691037297249, |
| "learning_rate": 1.7829819954678361e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 553616, |
| "step": 2015 |
| }, |
| { |
| "epoch": 17.876106194690266, |
| "grad_norm": 0.0003571947163436562, |
| "learning_rate": 1.7120701622143132e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 555264, |
| "step": 2020 |
| }, |
| { |
| "epoch": 17.920353982300885, |
| "grad_norm": 0.0006318918312899768, |
| "learning_rate": 1.6425472149038361e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 557008, |
| "step": 2025 |
| }, |
| { |
| "epoch": 17.964601769911503, |
| "grad_norm": 0.0003092987753916532, |
| "learning_rate": 1.5744172998664902e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 558560, |
| "step": 2030 |
| }, |
| { |
| "epoch": 18.0, |
| "eval_loss": 0.3990587592124939, |
| "eval_runtime": 0.7089, |
| "eval_samples_per_second": 35.264, |
| "eval_steps_per_second": 18.337, |
| "num_input_tokens_seen": 559408, |
| "step": 2034 |
| }, |
| { |
| "epoch": 18.008849557522122, |
| "grad_norm": 0.0024437231477349997, |
| "learning_rate": 1.5076844803522922e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 559616, |
| "step": 2035 |
| }, |
| { |
| "epoch": 18.053097345132745, |
| "grad_norm": 0.0005437818472273648, |
| "learning_rate": 1.4423527362888546e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 560928, |
| "step": 2040 |
| }, |
| { |
| "epoch": 18.097345132743364, |
| "grad_norm": 0.0004325391782913357, |
| "learning_rate": 1.3784259640440279e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 562208, |
| "step": 2045 |
| }, |
| { |
| "epoch": 18.141592920353983, |
| "grad_norm": 0.007978626526892185, |
| "learning_rate": 1.3159079761934923e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 563792, |
| "step": 2050 |
| }, |
| { |
| "epoch": 18.1858407079646, |
| "grad_norm": 0.0003740122483577579, |
| "learning_rate": 1.2548025012934367e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 565056, |
| "step": 2055 |
| }, |
| { |
| "epoch": 18.23008849557522, |
| "grad_norm": 0.0004064972454216331, |
| "learning_rate": 1.195113183658131e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 566176, |
| "step": 2060 |
| }, |
| { |
| "epoch": 18.27433628318584, |
| "grad_norm": 0.0005885816644877195, |
| "learning_rate": 1.1368435831426021e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 567888, |
| "step": 2065 |
| }, |
| { |
| "epoch": 18.31858407079646, |
| "grad_norm": 0.0008726578671485186, |
| "learning_rate": 1.0799971749303333e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 569488, |
| "step": 2070 |
| }, |
| { |
| "epoch": 18.36283185840708, |
| "grad_norm": 0.0007129801670089364, |
| "learning_rate": 1.0245773493259946e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 570640, |
| "step": 2075 |
| }, |
| { |
| "epoch": 18.4070796460177, |
| "grad_norm": 0.0004051835567224771, |
| "learning_rate": 9.705874115532532e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 572096, |
| "step": 2080 |
| }, |
| { |
| "epoch": 18.451327433628318, |
| "grad_norm": 0.002030750969424844, |
| "learning_rate": 9.180305815576301e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 573680, |
| "step": 2085 |
| }, |
| { |
| "epoch": 18.495575221238937, |
| "grad_norm": 0.0014132376527413726, |
| "learning_rate": 8.669099938144992e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 575056, |
| "step": 2090 |
| }, |
| { |
| "epoch": 18.539823008849556, |
| "grad_norm": 0.0003164079098496586, |
| "learning_rate": 8.172286971421167e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 576320, |
| "step": 2095 |
| }, |
| { |
| "epoch": 18.58407079646018, |
| "grad_norm": 0.004243460018187761, |
| "learning_rate": 7.689896545198111e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 577712, |
| "step": 2100 |
| }, |
| { |
| "epoch": 18.628318584070797, |
| "grad_norm": 0.0005824628169648349, |
| "learning_rate": 7.221957429112469e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 578784, |
| "step": 2105 |
| }, |
| { |
| "epoch": 18.672566371681416, |
| "grad_norm": 0.001415911945514381, |
| "learning_rate": 6.768497530928785e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 580048, |
| "step": 2110 |
| }, |
| { |
| "epoch": 18.716814159292035, |
| "grad_norm": 0.0010125319240614772, |
| "learning_rate": 6.329543894874779e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 581344, |
| "step": 2115 |
| }, |
| { |
| "epoch": 18.761061946902654, |
| "grad_norm": 0.0009618778130970895, |
| "learning_rate": 5.905122700028576e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 582624, |
| "step": 2120 |
| }, |
| { |
| "epoch": 18.805309734513273, |
| "grad_norm": 0.0005287949461489916, |
| "learning_rate": 5.49525925875738e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 584288, |
| "step": 2125 |
| }, |
| { |
| "epoch": 18.849557522123895, |
| "grad_norm": 0.0004833740240428597, |
| "learning_rate": 5.099978015207868e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 585488, |
| "step": 2130 |
| }, |
| { |
| "epoch": 18.893805309734514, |
| "grad_norm": 0.0004988937871530652, |
| "learning_rate": 4.719302543848225e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 586992, |
| "step": 2135 |
| }, |
| { |
| "epoch": 18.938053097345133, |
| "grad_norm": 0.006551985163241625, |
| "learning_rate": 4.3532555480624295e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 588800, |
| "step": 2140 |
| }, |
| { |
| "epoch": 18.98230088495575, |
| "grad_norm": 0.0003706456918735057, |
| "learning_rate": 4.001858858795893e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 590176, |
| "step": 2145 |
| }, |
| { |
| "epoch": 19.0, |
| "eval_loss": 0.3979104459285736, |
| "eval_runtime": 0.7066, |
| "eval_samples_per_second": 35.379, |
| "eval_steps_per_second": 18.397, |
| "num_input_tokens_seen": 590544, |
| "step": 2147 |
| }, |
| { |
| "epoch": 19.02654867256637, |
| "grad_norm": 0.00808491837233305, |
| "learning_rate": 3.665133433253809e-07, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 591376, |
| "step": 2150 |
| }, |
| { |
| "epoch": 19.07079646017699, |
| "grad_norm": 0.006679430603981018, |
| "learning_rate": 3.34309935365093e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 592944, |
| "step": 2155 |
| }, |
| { |
| "epoch": 19.115044247787612, |
| "grad_norm": 0.000536609732080251, |
| "learning_rate": 3.03577582601422e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 594240, |
| "step": 2160 |
| }, |
| { |
| "epoch": 19.15929203539823, |
| "grad_norm": 0.00040848698699846864, |
| "learning_rate": 2.743181179037047e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 595536, |
| "step": 2165 |
| }, |
| { |
| "epoch": 19.20353982300885, |
| "grad_norm": 0.0007450102712027729, |
| "learning_rate": 2.465332862986447e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 596944, |
| "step": 2170 |
| }, |
| { |
| "epoch": 19.24778761061947, |
| "grad_norm": 0.000604320433922112, |
| "learning_rate": 2.2022474486620427e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 598032, |
| "step": 2175 |
| }, |
| { |
| "epoch": 19.292035398230087, |
| "grad_norm": 0.00043978739995509386, |
| "learning_rate": 1.953940626408024e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 599200, |
| "step": 2180 |
| }, |
| { |
| "epoch": 19.336283185840706, |
| "grad_norm": 0.004369417671114206, |
| "learning_rate": 1.720427205177233e-07, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 600752, |
| "step": 2185 |
| }, |
| { |
| "epoch": 19.38053097345133, |
| "grad_norm": 0.0002871988690458238, |
| "learning_rate": 1.5017211116479802e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 602448, |
| "step": 2190 |
| }, |
| { |
| "epoch": 19.424778761061948, |
| "grad_norm": 0.0005315208109095693, |
| "learning_rate": 1.297835389393598e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 603936, |
| "step": 2195 |
| }, |
| { |
| "epoch": 19.469026548672566, |
| "grad_norm": 0.00042117590783163905, |
| "learning_rate": 1.1087821981042856e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 605520, |
| "step": 2200 |
| }, |
| { |
| "epoch": 19.513274336283185, |
| "grad_norm": 0.013347339816391468, |
| "learning_rate": 9.345728128621611e-08, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 606880, |
| "step": 2205 |
| }, |
| { |
| "epoch": 19.557522123893804, |
| "grad_norm": 0.0008802915108390152, |
| "learning_rate": 7.752176234685771e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 608304, |
| "step": 2210 |
| }, |
| { |
| "epoch": 19.601769911504427, |
| "grad_norm": 0.0005139311542734504, |
| "learning_rate": 6.307261338246718e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 609456, |
| "step": 2215 |
| }, |
| { |
| "epoch": 19.646017699115045, |
| "grad_norm": 0.0005398978828452528, |
| "learning_rate": 5.011069613644892e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 611120, |
| "step": 2220 |
| }, |
| { |
| "epoch": 19.690265486725664, |
| "grad_norm": 0.0003701553796418011, |
| "learning_rate": 3.8636783654100174e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 612672, |
| "step": 2225 |
| }, |
| { |
| "epoch": 19.734513274336283, |
| "grad_norm": 0.0003791327471844852, |
| "learning_rate": 2.865156023650617e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 613840, |
| "step": 2230 |
| }, |
| { |
| "epoch": 19.778761061946902, |
| "grad_norm": 0.0003429692005738616, |
| "learning_rate": 2.0155621399742254e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 615184, |
| "step": 2235 |
| }, |
| { |
| "epoch": 19.82300884955752, |
| "grad_norm": 0.0005890704342164099, |
| "learning_rate": 1.31494738393384e-08, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 616432, |
| "step": 2240 |
| }, |
| { |
| "epoch": 19.86725663716814, |
| "grad_norm": 0.0004076314507983625, |
| "learning_rate": 7.633535400070057e-09, |
| "loss": 0.0, |
| "num_input_tokens_seen": 617904, |
| "step": 2245 |
| }, |
| { |
| "epoch": 19.911504424778762, |
| "grad_norm": 0.00038537936052307487, |
| "learning_rate": 3.6081350510447365e-09, |
| "loss": 0.0, |
| "num_input_tokens_seen": 619296, |
| "step": 2250 |
| }, |
| { |
| "epoch": 19.95575221238938, |
| "grad_norm": 0.0005400259979069233, |
| "learning_rate": 1.0735128660649406e-09, |
| "loss": 0.0, |
| "num_input_tokens_seen": 620560, |
| "step": 2255 |
| }, |
| { |
| "epoch": 20.0, |
| "grad_norm": 0.0006321980035863817, |
| "learning_rate": 2.982000932294504e-11, |
| "loss": 0.0, |
| "num_input_tokens_seen": 621640, |
| "step": 2260 |
| }, |
| { |
| "epoch": 20.0, |
| "eval_loss": 0.3967692255973816, |
| "eval_runtime": 0.7049, |
| "eval_samples_per_second": 35.466, |
| "eval_steps_per_second": 18.442, |
| "num_input_tokens_seen": 621640, |
| "step": 2260 |
| }, |
| { |
| "epoch": 20.0, |
| "num_input_tokens_seen": 621640, |
| "step": 2260, |
| "total_flos": 2.799216823861248e+16, |
| "train_loss": 0.15936206933981492, |
| "train_runtime": 253.0981, |
| "train_samples_per_second": 17.78, |
| "train_steps_per_second": 8.929 |
| } |
| ], |
| "logging_steps": 5, |
| "max_steps": 2260, |
| "num_input_tokens_seen": 621640, |
| "num_train_epochs": 20, |
| "save_steps": 113, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 2.799216823861248e+16, |
| "train_batch_size": 2, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|