{ "best_global_step": 375, "best_metric": 2.8448235988616943, "best_model_checkpoint": "outputs/checkpoint-375", "epoch": 14.970873786407767, "eval_steps": 500, "global_step": 375, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.038834951456310676, "grad_norm": 21.30242156982422, "learning_rate": 0.0, "loss": 6.5474, "step": 1 }, { "epoch": 0.07766990291262135, "grad_norm": 20.775470733642578, "learning_rate": 2.0000000000000002e-07, "loss": 6.5613, "step": 2 }, { "epoch": 0.11650485436893204, "grad_norm": 20.96541976928711, "learning_rate": 4.0000000000000003e-07, "loss": 6.5127, "step": 3 }, { "epoch": 0.1553398058252427, "grad_norm": 20.376543045043945, "learning_rate": 6.000000000000001e-07, "loss": 6.4569, "step": 4 }, { "epoch": 0.1941747572815534, "grad_norm": 19.54267692565918, "learning_rate": 8.000000000000001e-07, "loss": 6.3743, "step": 5 }, { "epoch": 0.23300970873786409, "grad_norm": 19.233882904052734, "learning_rate": 1.0000000000000002e-06, "loss": 6.3899, "step": 6 }, { "epoch": 0.27184466019417475, "grad_norm": 20.25909423828125, "learning_rate": 1.2000000000000002e-06, "loss": 6.4415, "step": 7 }, { "epoch": 0.3106796116504854, "grad_norm": 19.33000373840332, "learning_rate": 1.4000000000000001e-06, "loss": 6.3191, "step": 8 }, { "epoch": 0.34951456310679613, "grad_norm": 18.305322647094727, "learning_rate": 1.6000000000000001e-06, "loss": 6.2681, "step": 9 }, { "epoch": 0.3883495145631068, "grad_norm": 17.74665069580078, "learning_rate": 1.8e-06, "loss": 6.4206, "step": 10 }, { "epoch": 0.42718446601941745, "grad_norm": 14.93736457824707, "learning_rate": 2.0000000000000003e-06, "loss": 6.2288, "step": 11 }, { "epoch": 0.46601941747572817, "grad_norm": 14.914277076721191, "learning_rate": 2.2e-06, "loss": 6.3961, "step": 12 }, { "epoch": 0.5048543689320388, "grad_norm": 13.266161918640137, "learning_rate": 2.4000000000000003e-06, "loss": 6.0076, "step": 13 }, { "epoch": 0.5436893203883495, "grad_norm": 12.377790451049805, "learning_rate": 2.6e-06, "loss": 6.2259, "step": 14 }, { "epoch": 0.5825242718446602, "grad_norm": 11.322343826293945, "learning_rate": 2.8000000000000003e-06, "loss": 6.1832, "step": 15 }, { "epoch": 0.6213592233009708, "grad_norm": 10.584484100341797, "learning_rate": 3e-06, "loss": 6.0051, "step": 16 }, { "epoch": 0.6601941747572816, "grad_norm": 10.82979965209961, "learning_rate": 3.2000000000000003e-06, "loss": 5.9786, "step": 17 }, { "epoch": 0.6990291262135923, "grad_norm": 10.112428665161133, "learning_rate": 3.4000000000000005e-06, "loss": 5.9778, "step": 18 }, { "epoch": 0.7378640776699029, "grad_norm": 9.44952392578125, "learning_rate": 3.6e-06, "loss": 5.9459, "step": 19 }, { "epoch": 0.7766990291262136, "grad_norm": 9.057659149169922, "learning_rate": 3.8e-06, "loss": 6.0317, "step": 20 }, { "epoch": 0.8155339805825242, "grad_norm": 9.000926971435547, "learning_rate": 4.000000000000001e-06, "loss": 5.9749, "step": 21 }, { "epoch": 0.8543689320388349, "grad_norm": 7.747213840484619, "learning_rate": 4.2000000000000004e-06, "loss": 5.8036, "step": 22 }, { "epoch": 0.8932038834951457, "grad_norm": 6.968072891235352, "learning_rate": 4.4e-06, "loss": 5.7705, "step": 23 }, { "epoch": 0.9320388349514563, "grad_norm": 7.167684555053711, "learning_rate": 4.6e-06, "loss": 5.7804, "step": 24 }, { "epoch": 0.970873786407767, "grad_norm": 6.384294033050537, "learning_rate": 4.800000000000001e-06, "loss": 5.6137, "step": 25 }, { "epoch": 0.970873786407767, "eval_loss": 5.6718363761901855, "eval_runtime": 2.435, "eval_samples_per_second": 9.035, "eval_steps_per_second": 2.464, "step": 25 }, { "epoch": 1.0388349514563107, "grad_norm": 12.656047821044922, "learning_rate": 5e-06, "loss": 11.4741, "step": 26 }, { "epoch": 1.0776699029126213, "grad_norm": 6.408062934875488, "learning_rate": 5.2e-06, "loss": 5.7835, "step": 27 }, { "epoch": 1.116504854368932, "grad_norm": 6.457642078399658, "learning_rate": 5.4e-06, "loss": 5.7659, "step": 28 }, { "epoch": 1.1553398058252426, "grad_norm": 6.716769218444824, "learning_rate": 5.600000000000001e-06, "loss": 5.6133, "step": 29 }, { "epoch": 1.1941747572815533, "grad_norm": 5.562079906463623, "learning_rate": 5.8e-06, "loss": 5.6588, "step": 30 }, { "epoch": 1.233009708737864, "grad_norm": 5.209117412567139, "learning_rate": 6e-06, "loss": 5.6118, "step": 31 }, { "epoch": 1.2718446601941746, "grad_norm": 5.505391597747803, "learning_rate": 6.2e-06, "loss": 5.6468, "step": 32 }, { "epoch": 1.3106796116504853, "grad_norm": 4.989831924438477, "learning_rate": 6.4000000000000006e-06, "loss": 5.5483, "step": 33 }, { "epoch": 1.3495145631067962, "grad_norm": 5.000854015350342, "learning_rate": 6.6e-06, "loss": 5.4522, "step": 34 }, { "epoch": 1.3883495145631068, "grad_norm": 4.343570232391357, "learning_rate": 6.800000000000001e-06, "loss": 5.3562, "step": 35 }, { "epoch": 1.4271844660194175, "grad_norm": 4.40326452255249, "learning_rate": 7.000000000000001e-06, "loss": 5.4561, "step": 36 }, { "epoch": 1.4660194174757282, "grad_norm": 4.1591901779174805, "learning_rate": 7.2e-06, "loss": 5.3806, "step": 37 }, { "epoch": 1.5048543689320388, "grad_norm": 4.1347246170043945, "learning_rate": 7.4e-06, "loss": 5.419, "step": 38 }, { "epoch": 1.5436893203883495, "grad_norm": 4.123111248016357, "learning_rate": 7.6e-06, "loss": 5.2831, "step": 39 }, { "epoch": 1.5825242718446602, "grad_norm": 4.009028911590576, "learning_rate": 7.8e-06, "loss": 5.332, "step": 40 }, { "epoch": 1.6213592233009708, "grad_norm": 4.013438701629639, "learning_rate": 8.000000000000001e-06, "loss": 5.2177, "step": 41 }, { "epoch": 1.6601941747572817, "grad_norm": 3.698003053665161, "learning_rate": 8.200000000000001e-06, "loss": 5.3019, "step": 42 }, { "epoch": 1.6990291262135924, "grad_norm": 3.66217041015625, "learning_rate": 8.400000000000001e-06, "loss": 5.1967, "step": 43 }, { "epoch": 1.737864077669903, "grad_norm": 3.455019235610962, "learning_rate": 8.599999999999999e-06, "loss": 5.1573, "step": 44 }, { "epoch": 1.7766990291262137, "grad_norm": 3.5593278408050537, "learning_rate": 8.8e-06, "loss": 5.1463, "step": 45 }, { "epoch": 1.8155339805825244, "grad_norm": 3.332477331161499, "learning_rate": 9e-06, "loss": 5.1732, "step": 46 }, { "epoch": 1.854368932038835, "grad_norm": 3.2428054809570312, "learning_rate": 9.2e-06, "loss": 5.0962, "step": 47 }, { "epoch": 1.8932038834951457, "grad_norm": 3.339063882827759, "learning_rate": 9.4e-06, "loss": 5.0253, "step": 48 }, { "epoch": 1.9320388349514563, "grad_norm": 3.4746124744415283, "learning_rate": 9.600000000000001e-06, "loss": 5.1363, "step": 49 }, { "epoch": 1.970873786407767, "grad_norm": 3.371466875076294, "learning_rate": 9.800000000000001e-06, "loss": 5.1445, "step": 50 }, { "epoch": 1.970873786407767, "eval_loss": 5.052736282348633, "eval_runtime": 0.9737, "eval_samples_per_second": 22.593, "eval_steps_per_second": 6.162, "step": 50 }, { "epoch": 2.0388349514563107, "grad_norm": 5.6498637199401855, "learning_rate": 1e-05, "loss": 10.112, "step": 51 }, { "epoch": 2.0776699029126213, "grad_norm": 3.1301138401031494, "learning_rate": 1.02e-05, "loss": 5.1063, "step": 52 }, { "epoch": 2.116504854368932, "grad_norm": 3.452958345413208, "learning_rate": 1.04e-05, "loss": 5.0082, "step": 53 }, { "epoch": 2.1553398058252426, "grad_norm": 3.1977169513702393, "learning_rate": 1.06e-05, "loss": 4.9698, "step": 54 }, { "epoch": 2.1941747572815533, "grad_norm": 2.6776535511016846, "learning_rate": 1.08e-05, "loss": 4.9449, "step": 55 }, { "epoch": 2.233009708737864, "grad_norm": 3.5574913024902344, "learning_rate": 1.1000000000000001e-05, "loss": 5.0442, "step": 56 }, { "epoch": 2.2718446601941746, "grad_norm": 2.867915391921997, "learning_rate": 1.1200000000000001e-05, "loss": 4.8769, "step": 57 }, { "epoch": 2.3106796116504853, "grad_norm": 2.764223098754883, "learning_rate": 1.1400000000000001e-05, "loss": 4.9286, "step": 58 }, { "epoch": 2.349514563106796, "grad_norm": 3.816723585128784, "learning_rate": 1.16e-05, "loss": 4.8921, "step": 59 }, { "epoch": 2.3883495145631066, "grad_norm": 3.161980152130127, "learning_rate": 1.18e-05, "loss": 4.916, "step": 60 }, { "epoch": 2.4271844660194173, "grad_norm": 2.8373942375183105, "learning_rate": 1.2e-05, "loss": 4.8942, "step": 61 }, { "epoch": 2.466019417475728, "grad_norm": 2.8898000717163086, "learning_rate": 1.22e-05, "loss": 4.8206, "step": 62 }, { "epoch": 2.5048543689320386, "grad_norm": 2.726362943649292, "learning_rate": 1.24e-05, "loss": 4.846, "step": 63 }, { "epoch": 2.5436893203883493, "grad_norm": 2.73665714263916, "learning_rate": 1.2600000000000001e-05, "loss": 4.8375, "step": 64 }, { "epoch": 2.58252427184466, "grad_norm": 3.1228106021881104, "learning_rate": 1.2800000000000001e-05, "loss": 4.7526, "step": 65 }, { "epoch": 2.6213592233009706, "grad_norm": 2.9702351093292236, "learning_rate": 1.3000000000000001e-05, "loss": 4.8024, "step": 66 }, { "epoch": 2.6601941747572817, "grad_norm": 3.0533952713012695, "learning_rate": 1.32e-05, "loss": 4.7883, "step": 67 }, { "epoch": 2.6990291262135924, "grad_norm": 3.1949095726013184, "learning_rate": 1.3400000000000002e-05, "loss": 4.8197, "step": 68 }, { "epoch": 2.737864077669903, "grad_norm": 3.399998426437378, "learning_rate": 1.3600000000000002e-05, "loss": 4.6677, "step": 69 }, { "epoch": 2.7766990291262137, "grad_norm": 2.80118465423584, "learning_rate": 1.3800000000000002e-05, "loss": 4.6291, "step": 70 }, { "epoch": 2.8155339805825244, "grad_norm": 2.8477330207824707, "learning_rate": 1.4000000000000001e-05, "loss": 4.7767, "step": 71 }, { "epoch": 2.854368932038835, "grad_norm": 2.6895911693573, "learning_rate": 1.42e-05, "loss": 4.7057, "step": 72 }, { "epoch": 2.8932038834951457, "grad_norm": 2.914586067199707, "learning_rate": 1.44e-05, "loss": 4.6386, "step": 73 }, { "epoch": 2.9320388349514563, "grad_norm": 2.6184370517730713, "learning_rate": 1.4599999999999999e-05, "loss": 4.6679, "step": 74 }, { "epoch": 2.970873786407767, "grad_norm": 3.00891375541687, "learning_rate": 1.48e-05, "loss": 4.6319, "step": 75 }, { "epoch": 2.970873786407767, "eval_loss": 4.614713668823242, "eval_runtime": 0.9702, "eval_samples_per_second": 22.675, "eval_steps_per_second": 6.184, "step": 75 }, { "epoch": 3.0388349514563107, "grad_norm": 5.222214221954346, "learning_rate": 1.5e-05, "loss": 9.2212, "step": 76 }, { "epoch": 3.0776699029126213, "grad_norm": 2.716062307357788, "learning_rate": 1.52e-05, "loss": 4.6294, "step": 77 }, { "epoch": 3.116504854368932, "grad_norm": 2.503143548965454, "learning_rate": 1.54e-05, "loss": 4.5572, "step": 78 }, { "epoch": 3.1553398058252426, "grad_norm": 2.9183573722839355, "learning_rate": 1.56e-05, "loss": 4.453, "step": 79 }, { "epoch": 3.1941747572815533, "grad_norm": 2.7854349613189697, "learning_rate": 1.58e-05, "loss": 4.5746, "step": 80 }, { "epoch": 3.233009708737864, "grad_norm": 2.8391106128692627, "learning_rate": 1.6000000000000003e-05, "loss": 4.5228, "step": 81 }, { "epoch": 3.2718446601941746, "grad_norm": 2.5229265689849854, "learning_rate": 1.62e-05, "loss": 4.4692, "step": 82 }, { "epoch": 3.3106796116504853, "grad_norm": 2.643170118331909, "learning_rate": 1.6400000000000002e-05, "loss": 4.498, "step": 83 }, { "epoch": 3.349514563106796, "grad_norm": 2.542393922805786, "learning_rate": 1.66e-05, "loss": 4.4816, "step": 84 }, { "epoch": 3.3883495145631066, "grad_norm": 2.563282012939453, "learning_rate": 1.6800000000000002e-05, "loss": 4.4824, "step": 85 }, { "epoch": 3.4271844660194173, "grad_norm": 2.698516368865967, "learning_rate": 1.7000000000000003e-05, "loss": 4.4717, "step": 86 }, { "epoch": 3.466019417475728, "grad_norm": 2.936776638031006, "learning_rate": 1.7199999999999998e-05, "loss": 4.346, "step": 87 }, { "epoch": 3.5048543689320386, "grad_norm": 2.9594175815582275, "learning_rate": 1.74e-05, "loss": 4.3689, "step": 88 }, { "epoch": 3.5436893203883493, "grad_norm": 3.02431583404541, "learning_rate": 1.76e-05, "loss": 4.3922, "step": 89 }, { "epoch": 3.58252427184466, "grad_norm": 3.238933563232422, "learning_rate": 1.78e-05, "loss": 4.4046, "step": 90 }, { "epoch": 3.6213592233009706, "grad_norm": 3.368084192276001, "learning_rate": 1.8e-05, "loss": 4.3768, "step": 91 }, { "epoch": 3.6601941747572817, "grad_norm": 3.8072586059570312, "learning_rate": 1.8200000000000002e-05, "loss": 4.3188, "step": 92 }, { "epoch": 3.6990291262135924, "grad_norm": 3.2370452880859375, "learning_rate": 1.84e-05, "loss": 4.3368, "step": 93 }, { "epoch": 3.737864077669903, "grad_norm": 3.302961826324463, "learning_rate": 1.86e-05, "loss": 4.3339, "step": 94 }, { "epoch": 3.7766990291262137, "grad_norm": 3.5947256088256836, "learning_rate": 1.88e-05, "loss": 4.2763, "step": 95 }, { "epoch": 3.8155339805825244, "grad_norm": 2.955308437347412, "learning_rate": 1.9e-05, "loss": 4.3941, "step": 96 }, { "epoch": 3.854368932038835, "grad_norm": 3.303628444671631, "learning_rate": 1.9200000000000003e-05, "loss": 4.2748, "step": 97 }, { "epoch": 3.8932038834951457, "grad_norm": 2.7507269382476807, "learning_rate": 1.94e-05, "loss": 4.2881, "step": 98 }, { "epoch": 3.9320388349514563, "grad_norm": 2.6451849937438965, "learning_rate": 1.9600000000000002e-05, "loss": 4.3818, "step": 99 }, { "epoch": 3.970873786407767, "grad_norm": 4.112302780151367, "learning_rate": 1.9800000000000004e-05, "loss": 4.2882, "step": 100 }, { "epoch": 3.970873786407767, "eval_loss": 4.266085624694824, "eval_runtime": 1.036, "eval_samples_per_second": 21.235, "eval_steps_per_second": 5.791, "step": 100 }, { "epoch": 4.038834951456311, "grad_norm": 5.2990498542785645, "learning_rate": 2e-05, "loss": 8.631, "step": 101 }, { "epoch": 4.077669902912621, "grad_norm": 3.757814407348633, "learning_rate": 2.0200000000000003e-05, "loss": 4.2183, "step": 102 }, { "epoch": 4.116504854368932, "grad_norm": 2.905704975128174, "learning_rate": 2.04e-05, "loss": 4.1782, "step": 103 }, { "epoch": 4.155339805825243, "grad_norm": 3.7264492511749268, "learning_rate": 2.06e-05, "loss": 4.2959, "step": 104 }, { "epoch": 4.194174757281553, "grad_norm": 3.9989054203033447, "learning_rate": 2.08e-05, "loss": 4.1876, "step": 105 }, { "epoch": 4.233009708737864, "grad_norm": 2.978239059448242, "learning_rate": 2.1e-05, "loss": 4.1484, "step": 106 }, { "epoch": 4.271844660194175, "grad_norm": 3.223487138748169, "learning_rate": 2.12e-05, "loss": 4.1501, "step": 107 }, { "epoch": 4.310679611650485, "grad_norm": 3.035008668899536, "learning_rate": 2.1400000000000002e-05, "loss": 4.1316, "step": 108 }, { "epoch": 4.349514563106796, "grad_norm": 2.878307819366455, "learning_rate": 2.16e-05, "loss": 4.1824, "step": 109 }, { "epoch": 4.388349514563107, "grad_norm": 3.095815420150757, "learning_rate": 2.18e-05, "loss": 4.1726, "step": 110 }, { "epoch": 4.427184466019417, "grad_norm": 3.0754470825195312, "learning_rate": 2.2000000000000003e-05, "loss": 3.9618, "step": 111 }, { "epoch": 4.466019417475728, "grad_norm": 3.4234559535980225, "learning_rate": 2.22e-05, "loss": 4.0646, "step": 112 }, { "epoch": 4.504854368932039, "grad_norm": 3.2128183841705322, "learning_rate": 2.2400000000000002e-05, "loss": 4.0639, "step": 113 }, { "epoch": 4.543689320388349, "grad_norm": 2.9789934158325195, "learning_rate": 2.26e-05, "loss": 4.1373, "step": 114 }, { "epoch": 4.58252427184466, "grad_norm": 2.5928032398223877, "learning_rate": 2.2800000000000002e-05, "loss": 3.9855, "step": 115 }, { "epoch": 4.621359223300971, "grad_norm": 3.082489252090454, "learning_rate": 2.3000000000000003e-05, "loss": 4.1163, "step": 116 }, { "epoch": 4.660194174757281, "grad_norm": 3.028413772583008, "learning_rate": 2.32e-05, "loss": 4.0571, "step": 117 }, { "epoch": 4.699029126213592, "grad_norm": 2.8744428157806396, "learning_rate": 2.3400000000000003e-05, "loss": 4.027, "step": 118 }, { "epoch": 4.737864077669903, "grad_norm": 2.866056442260742, "learning_rate": 2.36e-05, "loss": 4.0299, "step": 119 }, { "epoch": 4.776699029126213, "grad_norm": 2.75072979927063, "learning_rate": 2.38e-05, "loss": 3.993, "step": 120 }, { "epoch": 4.815533980582524, "grad_norm": 2.8751604557037354, "learning_rate": 2.4e-05, "loss": 3.9961, "step": 121 }, { "epoch": 4.854368932038835, "grad_norm": 2.5905075073242188, "learning_rate": 2.4200000000000002e-05, "loss": 3.9582, "step": 122 }, { "epoch": 4.893203883495145, "grad_norm": 3.143044948577881, "learning_rate": 2.44e-05, "loss": 3.9464, "step": 123 }, { "epoch": 4.932038834951456, "grad_norm": 2.6397016048431396, "learning_rate": 2.46e-05, "loss": 4.0075, "step": 124 }, { "epoch": 4.970873786407767, "grad_norm": 3.2383229732513428, "learning_rate": 2.48e-05, "loss": 3.9822, "step": 125 }, { "epoch": 4.970873786407767, "eval_loss": 3.980665445327759, "eval_runtime": 1.0248, "eval_samples_per_second": 21.467, "eval_steps_per_second": 5.855, "step": 125 }, { "epoch": 5.038834951456311, "grad_norm": 5.962584495544434, "learning_rate": 2.5e-05, "loss": 7.7604, "step": 126 }, { "epoch": 5.077669902912621, "grad_norm": 3.243708610534668, "learning_rate": 2.5200000000000003e-05, "loss": 3.9438, "step": 127 }, { "epoch": 5.116504854368932, "grad_norm": 2.763148307800293, "learning_rate": 2.54e-05, "loss": 3.8661, "step": 128 }, { "epoch": 5.155339805825243, "grad_norm": 2.6233339309692383, "learning_rate": 2.5600000000000002e-05, "loss": 3.9006, "step": 129 }, { "epoch": 5.194174757281553, "grad_norm": 3.1037437915802, "learning_rate": 2.58e-05, "loss": 3.9066, "step": 130 }, { "epoch": 5.233009708737864, "grad_norm": 3.3434383869171143, "learning_rate": 2.6000000000000002e-05, "loss": 3.8425, "step": 131 }, { "epoch": 5.271844660194175, "grad_norm": 3.0016958713531494, "learning_rate": 2.6200000000000003e-05, "loss": 3.8723, "step": 132 }, { "epoch": 5.310679611650485, "grad_norm": 3.2040951251983643, "learning_rate": 2.64e-05, "loss": 3.8326, "step": 133 }, { "epoch": 5.349514563106796, "grad_norm": 3.892890453338623, "learning_rate": 2.6600000000000003e-05, "loss": 3.9277, "step": 134 }, { "epoch": 5.388349514563107, "grad_norm": 3.3505635261535645, "learning_rate": 2.6800000000000004e-05, "loss": 3.7381, "step": 135 }, { "epoch": 5.427184466019417, "grad_norm": 3.60493803024292, "learning_rate": 2.7000000000000002e-05, "loss": 3.9003, "step": 136 }, { "epoch": 5.466019417475728, "grad_norm": 3.3468196392059326, "learning_rate": 2.7200000000000004e-05, "loss": 3.829, "step": 137 }, { "epoch": 5.504854368932039, "grad_norm": 2.7208919525146484, "learning_rate": 2.7400000000000002e-05, "loss": 3.7987, "step": 138 }, { "epoch": 5.543689320388349, "grad_norm": 4.0348920822143555, "learning_rate": 2.7600000000000003e-05, "loss": 3.8318, "step": 139 }, { "epoch": 5.58252427184466, "grad_norm": 3.560403347015381, "learning_rate": 2.7800000000000005e-05, "loss": 3.763, "step": 140 }, { "epoch": 5.621359223300971, "grad_norm": 3.262423515319824, "learning_rate": 2.8000000000000003e-05, "loss": 3.7441, "step": 141 }, { "epoch": 5.660194174757281, "grad_norm": 2.7930023670196533, "learning_rate": 2.8199999999999998e-05, "loss": 3.7323, "step": 142 }, { "epoch": 5.699029126213592, "grad_norm": 2.5322391986846924, "learning_rate": 2.84e-05, "loss": 3.6681, "step": 143 }, { "epoch": 5.737864077669903, "grad_norm": 4.258012294769287, "learning_rate": 2.86e-05, "loss": 3.7049, "step": 144 }, { "epoch": 5.776699029126213, "grad_norm": 3.0756101608276367, "learning_rate": 2.88e-05, "loss": 3.7184, "step": 145 }, { "epoch": 5.815533980582524, "grad_norm": 3.0040361881256104, "learning_rate": 2.9e-05, "loss": 3.6077, "step": 146 }, { "epoch": 5.854368932038835, "grad_norm": 4.292761325836182, "learning_rate": 2.9199999999999998e-05, "loss": 3.7214, "step": 147 }, { "epoch": 5.893203883495145, "grad_norm": 2.876159906387329, "learning_rate": 2.94e-05, "loss": 3.6643, "step": 148 }, { "epoch": 5.932038834951456, "grad_norm": 3.1686434745788574, "learning_rate": 2.96e-05, "loss": 3.68, "step": 149 }, { "epoch": 5.970873786407767, "grad_norm": 3.1515626907348633, "learning_rate": 2.98e-05, "loss": 3.6581, "step": 150 }, { "epoch": 5.970873786407767, "eval_loss": 3.7385447025299072, "eval_runtime": 1.0393, "eval_samples_per_second": 21.169, "eval_steps_per_second": 5.773, "step": 150 }, { "epoch": 6.038834951456311, "grad_norm": 6.013641834259033, "learning_rate": 3e-05, "loss": 7.2601, "step": 151 }, { "epoch": 6.077669902912621, "grad_norm": 3.0433292388916016, "learning_rate": 3.02e-05, "loss": 3.626, "step": 152 }, { "epoch": 6.116504854368932, "grad_norm": 2.9623515605926514, "learning_rate": 3.04e-05, "loss": 3.5856, "step": 153 }, { "epoch": 6.155339805825243, "grad_norm": 3.333615779876709, "learning_rate": 3.06e-05, "loss": 3.6268, "step": 154 }, { "epoch": 6.194174757281553, "grad_norm": 3.0843307971954346, "learning_rate": 3.08e-05, "loss": 3.5651, "step": 155 }, { "epoch": 6.233009708737864, "grad_norm": 2.859063148498535, "learning_rate": 3.1e-05, "loss": 3.5464, "step": 156 }, { "epoch": 6.271844660194175, "grad_norm": 2.92948842048645, "learning_rate": 3.12e-05, "loss": 3.6385, "step": 157 }, { "epoch": 6.310679611650485, "grad_norm": 3.552112340927124, "learning_rate": 3.1400000000000004e-05, "loss": 3.5479, "step": 158 }, { "epoch": 6.349514563106796, "grad_norm": 2.9934771060943604, "learning_rate": 3.16e-05, "loss": 3.5697, "step": 159 }, { "epoch": 6.388349514563107, "grad_norm": 2.595054864883423, "learning_rate": 3.18e-05, "loss": 3.4817, "step": 160 }, { "epoch": 6.427184466019417, "grad_norm": 3.077573537826538, "learning_rate": 3.2000000000000005e-05, "loss": 3.5286, "step": 161 }, { "epoch": 6.466019417475728, "grad_norm": 2.5149052143096924, "learning_rate": 3.2200000000000003e-05, "loss": 3.6065, "step": 162 }, { "epoch": 6.504854368932039, "grad_norm": 2.6401753425598145, "learning_rate": 3.24e-05, "loss": 3.4707, "step": 163 }, { "epoch": 6.543689320388349, "grad_norm": 2.725781202316284, "learning_rate": 3.26e-05, "loss": 3.5645, "step": 164 }, { "epoch": 6.58252427184466, "grad_norm": 2.7084786891937256, "learning_rate": 3.2800000000000004e-05, "loss": 3.5482, "step": 165 }, { "epoch": 6.621359223300971, "grad_norm": 2.6076486110687256, "learning_rate": 3.3e-05, "loss": 3.4842, "step": 166 }, { "epoch": 6.660194174757281, "grad_norm": 3.0237390995025635, "learning_rate": 3.32e-05, "loss": 3.5313, "step": 167 }, { "epoch": 6.699029126213592, "grad_norm": 2.807459831237793, "learning_rate": 3.3400000000000005e-05, "loss": 3.5354, "step": 168 }, { "epoch": 6.737864077669903, "grad_norm": 3.13301420211792, "learning_rate": 3.3600000000000004e-05, "loss": 3.4923, "step": 169 }, { "epoch": 6.776699029126213, "grad_norm": 2.5862674713134766, "learning_rate": 3.38e-05, "loss": 3.5315, "step": 170 }, { "epoch": 6.815533980582524, "grad_norm": 3.192603588104248, "learning_rate": 3.4000000000000007e-05, "loss": 3.4937, "step": 171 }, { "epoch": 6.854368932038835, "grad_norm": 2.440667152404785, "learning_rate": 3.4200000000000005e-05, "loss": 3.4632, "step": 172 }, { "epoch": 6.893203883495145, "grad_norm": 3.0425989627838135, "learning_rate": 3.4399999999999996e-05, "loss": 3.4456, "step": 173 }, { "epoch": 6.932038834951456, "grad_norm": 3.369929313659668, "learning_rate": 3.46e-05, "loss": 3.4061, "step": 174 }, { "epoch": 6.970873786407767, "grad_norm": 2.883514165878296, "learning_rate": 3.48e-05, "loss": 3.4312, "step": 175 }, { "epoch": 6.970873786407767, "eval_loss": 3.5276877880096436, "eval_runtime": 0.9695, "eval_samples_per_second": 22.692, "eval_steps_per_second": 6.189, "step": 175 }, { "epoch": 7.038834951456311, "grad_norm": 5.757262706756592, "learning_rate": 3.5e-05, "loss": 6.8588, "step": 176 }, { "epoch": 7.077669902912621, "grad_norm": 2.7623355388641357, "learning_rate": 3.52e-05, "loss": 3.389, "step": 177 }, { "epoch": 7.116504854368932, "grad_norm": 3.601408004760742, "learning_rate": 3.54e-05, "loss": 3.4136, "step": 178 }, { "epoch": 7.155339805825243, "grad_norm": 2.4193849563598633, "learning_rate": 3.56e-05, "loss": 3.3387, "step": 179 }, { "epoch": 7.194174757281553, "grad_norm": 3.1988773345947266, "learning_rate": 3.58e-05, "loss": 3.3565, "step": 180 }, { "epoch": 7.233009708737864, "grad_norm": 3.6124112606048584, "learning_rate": 3.6e-05, "loss": 3.3662, "step": 181 }, { "epoch": 7.271844660194175, "grad_norm": 2.836766242980957, "learning_rate": 3.62e-05, "loss": 3.2874, "step": 182 }, { "epoch": 7.310679611650485, "grad_norm": 3.2610206604003906, "learning_rate": 3.6400000000000004e-05, "loss": 3.2984, "step": 183 }, { "epoch": 7.349514563106796, "grad_norm": 2.8655757904052734, "learning_rate": 3.66e-05, "loss": 3.2584, "step": 184 }, { "epoch": 7.388349514563107, "grad_norm": 3.4718809127807617, "learning_rate": 3.68e-05, "loss": 3.2489, "step": 185 }, { "epoch": 7.427184466019417, "grad_norm": 3.2131571769714355, "learning_rate": 3.7e-05, "loss": 3.3321, "step": 186 }, { "epoch": 7.466019417475728, "grad_norm": 3.1714115142822266, "learning_rate": 3.72e-05, "loss": 3.3489, "step": 187 }, { "epoch": 7.504854368932039, "grad_norm": 2.877065658569336, "learning_rate": 3.74e-05, "loss": 3.245, "step": 188 }, { "epoch": 7.543689320388349, "grad_norm": 3.1105806827545166, "learning_rate": 3.76e-05, "loss": 3.272, "step": 189 }, { "epoch": 7.58252427184466, "grad_norm": 3.5332155227661133, "learning_rate": 3.7800000000000004e-05, "loss": 3.3132, "step": 190 }, { "epoch": 7.621359223300971, "grad_norm": 2.8226609230041504, "learning_rate": 3.8e-05, "loss": 3.2721, "step": 191 }, { "epoch": 7.660194174757281, "grad_norm": 2.5367422103881836, "learning_rate": 3.82e-05, "loss": 3.3234, "step": 192 }, { "epoch": 7.699029126213592, "grad_norm": 2.9826626777648926, "learning_rate": 3.8400000000000005e-05, "loss": 3.2643, "step": 193 }, { "epoch": 7.737864077669903, "grad_norm": 3.456496477127075, "learning_rate": 3.86e-05, "loss": 3.2105, "step": 194 }, { "epoch": 7.776699029126213, "grad_norm": 3.286680221557617, "learning_rate": 3.88e-05, "loss": 3.2156, "step": 195 }, { "epoch": 7.815533980582524, "grad_norm": 2.996983528137207, "learning_rate": 3.9000000000000006e-05, "loss": 3.3637, "step": 196 }, { "epoch": 7.854368932038835, "grad_norm": 3.129873037338257, "learning_rate": 3.9200000000000004e-05, "loss": 3.2444, "step": 197 }, { "epoch": 7.893203883495145, "grad_norm": 2.591716766357422, "learning_rate": 3.94e-05, "loss": 3.2831, "step": 198 }, { "epoch": 7.932038834951456, "grad_norm": 2.664017677307129, "learning_rate": 3.960000000000001e-05, "loss": 3.1692, "step": 199 }, { "epoch": 7.970873786407767, "grad_norm": 2.8941309452056885, "learning_rate": 3.9800000000000005e-05, "loss": 3.2986, "step": 200 }, { "epoch": 7.970873786407767, "eval_loss": 3.3523428440093994, "eval_runtime": 0.9896, "eval_samples_per_second": 22.23, "eval_steps_per_second": 6.063, "step": 200 }, { "epoch": 8.03883495145631, "grad_norm": 6.677456378936768, "learning_rate": 4e-05, "loss": 6.5052, "step": 201 }, { "epoch": 8.077669902912621, "grad_norm": 3.38222599029541, "learning_rate": 4.02e-05, "loss": 3.1682, "step": 202 }, { "epoch": 8.116504854368932, "grad_norm": 2.9144835472106934, "learning_rate": 4.0400000000000006e-05, "loss": 3.1505, "step": 203 }, { "epoch": 8.155339805825243, "grad_norm": 2.837830066680908, "learning_rate": 4.0600000000000004e-05, "loss": 3.1576, "step": 204 }, { "epoch": 8.194174757281553, "grad_norm": 3.3070290088653564, "learning_rate": 4.08e-05, "loss": 3.1545, "step": 205 }, { "epoch": 8.233009708737864, "grad_norm": 2.6031386852264404, "learning_rate": 4.1e-05, "loss": 3.1598, "step": 206 }, { "epoch": 8.271844660194175, "grad_norm": 2.879425525665283, "learning_rate": 4.12e-05, "loss": 3.1222, "step": 207 }, { "epoch": 8.310679611650485, "grad_norm": 3.14932918548584, "learning_rate": 4.14e-05, "loss": 3.0743, "step": 208 }, { "epoch": 8.349514563106796, "grad_norm": 3.3993191719055176, "learning_rate": 4.16e-05, "loss": 3.1589, "step": 209 }, { "epoch": 8.388349514563107, "grad_norm": 3.2141942977905273, "learning_rate": 4.18e-05, "loss": 3.0428, "step": 210 }, { "epoch": 8.427184466019417, "grad_norm": 2.791717290878296, "learning_rate": 4.2e-05, "loss": 3.1158, "step": 211 }, { "epoch": 8.466019417475728, "grad_norm": 3.1668970584869385, "learning_rate": 4.22e-05, "loss": 3.0463, "step": 212 }, { "epoch": 8.504854368932039, "grad_norm": 2.4356696605682373, "learning_rate": 4.24e-05, "loss": 3.1434, "step": 213 }, { "epoch": 8.54368932038835, "grad_norm": 2.9241132736206055, "learning_rate": 4.26e-05, "loss": 3.0292, "step": 214 }, { "epoch": 8.58252427184466, "grad_norm": 2.4170773029327393, "learning_rate": 4.2800000000000004e-05, "loss": 3.0923, "step": 215 }, { "epoch": 8.62135922330097, "grad_norm": 2.4428963661193848, "learning_rate": 4.3e-05, "loss": 3.0588, "step": 216 }, { "epoch": 8.660194174757281, "grad_norm": 3.0066943168640137, "learning_rate": 4.32e-05, "loss": 3.0815, "step": 217 }, { "epoch": 8.699029126213592, "grad_norm": 3.0532405376434326, "learning_rate": 4.3400000000000005e-05, "loss": 3.1377, "step": 218 }, { "epoch": 8.737864077669903, "grad_norm": 2.9405910968780518, "learning_rate": 4.36e-05, "loss": 3.081, "step": 219 }, { "epoch": 8.776699029126213, "grad_norm": 2.82438325881958, "learning_rate": 4.38e-05, "loss": 3.0538, "step": 220 }, { "epoch": 8.815533980582524, "grad_norm": 2.899946928024292, "learning_rate": 4.4000000000000006e-05, "loss": 3.0664, "step": 221 }, { "epoch": 8.854368932038835, "grad_norm": 2.4132299423217773, "learning_rate": 4.4200000000000004e-05, "loss": 3.0723, "step": 222 }, { "epoch": 8.893203883495145, "grad_norm": 3.2833642959594727, "learning_rate": 4.44e-05, "loss": 3.0445, "step": 223 }, { "epoch": 8.932038834951456, "grad_norm": 2.60457706451416, "learning_rate": 4.46e-05, "loss": 3.0134, "step": 224 }, { "epoch": 8.970873786407767, "grad_norm": 2.7552649974823, "learning_rate": 4.4800000000000005e-05, "loss": 3.0772, "step": 225 }, { "epoch": 8.970873786407767, "eval_loss": 3.2081830501556396, "eval_runtime": 0.971, "eval_samples_per_second": 22.657, "eval_steps_per_second": 6.179, "step": 225 }, { "epoch": 9.03883495145631, "grad_norm": 6.834669589996338, "learning_rate": 4.5e-05, "loss": 6.0, "step": 226 }, { "epoch": 9.077669902912621, "grad_norm": 2.7431795597076416, "learning_rate": 4.52e-05, "loss": 2.8938, "step": 227 }, { "epoch": 9.116504854368932, "grad_norm": 3.5260982513427734, "learning_rate": 4.5400000000000006e-05, "loss": 2.955, "step": 228 }, { "epoch": 9.155339805825243, "grad_norm": 2.705111026763916, "learning_rate": 4.5600000000000004e-05, "loss": 2.9586, "step": 229 }, { "epoch": 9.194174757281553, "grad_norm": 3.2462103366851807, "learning_rate": 4.58e-05, "loss": 2.8973, "step": 230 }, { "epoch": 9.233009708737864, "grad_norm": 2.788363218307495, "learning_rate": 4.600000000000001e-05, "loss": 2.9328, "step": 231 }, { "epoch": 9.271844660194175, "grad_norm": 2.4145243167877197, "learning_rate": 4.6200000000000005e-05, "loss": 2.9653, "step": 232 }, { "epoch": 9.310679611650485, "grad_norm": 3.2271153926849365, "learning_rate": 4.64e-05, "loss": 2.922, "step": 233 }, { "epoch": 9.349514563106796, "grad_norm": 2.3625218868255615, "learning_rate": 4.660000000000001e-05, "loss": 3.0413, "step": 234 }, { "epoch": 9.388349514563107, "grad_norm": 3.17262864112854, "learning_rate": 4.6800000000000006e-05, "loss": 2.9962, "step": 235 }, { "epoch": 9.427184466019417, "grad_norm": 2.906003475189209, "learning_rate": 4.7e-05, "loss": 2.9422, "step": 236 }, { "epoch": 9.466019417475728, "grad_norm": 2.1498398780822754, "learning_rate": 4.72e-05, "loss": 2.9061, "step": 237 }, { "epoch": 9.504854368932039, "grad_norm": 2.9519286155700684, "learning_rate": 4.74e-05, "loss": 2.967, "step": 238 }, { "epoch": 9.54368932038835, "grad_norm": 2.561063528060913, "learning_rate": 4.76e-05, "loss": 2.9191, "step": 239 }, { "epoch": 9.58252427184466, "grad_norm": 3.8291261196136475, "learning_rate": 4.78e-05, "loss": 2.9071, "step": 240 }, { "epoch": 9.62135922330097, "grad_norm": 3.4280309677124023, "learning_rate": 4.8e-05, "loss": 2.9384, "step": 241 }, { "epoch": 9.660194174757281, "grad_norm": 3.460054397583008, "learning_rate": 4.82e-05, "loss": 2.9387, "step": 242 }, { "epoch": 9.699029126213592, "grad_norm": 3.3750805854797363, "learning_rate": 4.8400000000000004e-05, "loss": 2.9552, "step": 243 }, { "epoch": 9.737864077669903, "grad_norm": 2.6689562797546387, "learning_rate": 4.86e-05, "loss": 2.8809, "step": 244 }, { "epoch": 9.776699029126213, "grad_norm": 2.9314560890197754, "learning_rate": 4.88e-05, "loss": 2.7902, "step": 245 }, { "epoch": 9.815533980582524, "grad_norm": 2.630530595779419, "learning_rate": 4.9e-05, "loss": 2.8857, "step": 246 }, { "epoch": 9.854368932038835, "grad_norm": 2.546659231185913, "learning_rate": 4.92e-05, "loss": 2.8896, "step": 247 }, { "epoch": 9.893203883495145, "grad_norm": 2.795778751373291, "learning_rate": 4.94e-05, "loss": 2.9516, "step": 248 }, { "epoch": 9.932038834951456, "grad_norm": 3.0504794120788574, "learning_rate": 4.96e-05, "loss": 2.9132, "step": 249 }, { "epoch": 9.970873786407767, "grad_norm": 3.444287061691284, "learning_rate": 4.9800000000000004e-05, "loss": 2.973, "step": 250 }, { "epoch": 9.970873786407767, "eval_loss": 3.089428186416626, "eval_runtime": 0.9754, "eval_samples_per_second": 22.555, "eval_steps_per_second": 6.151, "step": 250 }, { "epoch": 10.03883495145631, "grad_norm": 5.480017185211182, "learning_rate": 5e-05, "loss": 5.6196, "step": 251 }, { "epoch": 10.077669902912621, "grad_norm": 3.3957669734954834, "learning_rate": 4.999997563061038e-05, "loss": 2.8152, "step": 252 }, { "epoch": 10.116504854368932, "grad_norm": 2.6747496128082275, "learning_rate": 4.9999902522489015e-05, "loss": 2.8624, "step": 253 }, { "epoch": 10.155339805825243, "grad_norm": 3.2186131477355957, "learning_rate": 4.999978067577844e-05, "loss": 2.7587, "step": 254 }, { "epoch": 10.194174757281553, "grad_norm": 3.7385358810424805, "learning_rate": 4.999961009071621e-05, "loss": 2.8117, "step": 255 }, { "epoch": 10.233009708737864, "grad_norm": 2.586005926132202, "learning_rate": 4.999939076763487e-05, "loss": 2.7617, "step": 256 }, { "epoch": 10.271844660194175, "grad_norm": 2.7468533515930176, "learning_rate": 4.999912270696202e-05, "loss": 2.802, "step": 257 }, { "epoch": 10.310679611650485, "grad_norm": 2.7268691062927246, "learning_rate": 4.999880590922025e-05, "loss": 2.7928, "step": 258 }, { "epoch": 10.349514563106796, "grad_norm": 2.6305949687957764, "learning_rate": 4.9998440375027166e-05, "loss": 2.8245, "step": 259 }, { "epoch": 10.388349514563107, "grad_norm": 2.8977084159851074, "learning_rate": 4.9998026105095405e-05, "loss": 2.7525, "step": 260 }, { "epoch": 10.427184466019417, "grad_norm": 2.394578218460083, "learning_rate": 4.999756310023261e-05, "loss": 2.731, "step": 261 }, { "epoch": 10.466019417475728, "grad_norm": 3.0859174728393555, "learning_rate": 4.9997051361341425e-05, "loss": 2.7902, "step": 262 }, { "epoch": 10.504854368932039, "grad_norm": 2.929978370666504, "learning_rate": 4.9996490889419514e-05, "loss": 2.7723, "step": 263 }, { "epoch": 10.54368932038835, "grad_norm": 2.6215100288391113, "learning_rate": 4.999588168555954e-05, "loss": 2.7892, "step": 264 }, { "epoch": 10.58252427184466, "grad_norm": 2.744954824447632, "learning_rate": 4.999522375094919e-05, "loss": 2.8024, "step": 265 }, { "epoch": 10.62135922330097, "grad_norm": 2.775912046432495, "learning_rate": 4.999451708687114e-05, "loss": 2.642, "step": 266 }, { "epoch": 10.660194174757281, "grad_norm": 2.5821340084075928, "learning_rate": 4.999376169470306e-05, "loss": 2.7808, "step": 267 }, { "epoch": 10.699029126213592, "grad_norm": 2.4101083278656006, "learning_rate": 4.999295757591762e-05, "loss": 2.7318, "step": 268 }, { "epoch": 10.737864077669903, "grad_norm": 2.4816181659698486, "learning_rate": 4.99921047320825e-05, "loss": 2.7707, "step": 269 }, { "epoch": 10.776699029126213, "grad_norm": 2.366009473800659, "learning_rate": 4.9991203164860365e-05, "loss": 2.7481, "step": 270 }, { "epoch": 10.815533980582524, "grad_norm": 2.9792630672454834, "learning_rate": 4.999025287600886e-05, "loss": 2.7204, "step": 271 }, { "epoch": 10.854368932038835, "grad_norm": 3.0781967639923096, "learning_rate": 4.998925386738063e-05, "loss": 2.7248, "step": 272 }, { "epoch": 10.893203883495145, "grad_norm": 2.6866307258605957, "learning_rate": 4.998820614092328e-05, "loss": 2.7456, "step": 273 }, { "epoch": 10.932038834951456, "grad_norm": 2.789808988571167, "learning_rate": 4.998710969867942e-05, "loss": 2.7224, "step": 274 }, { "epoch": 10.970873786407767, "grad_norm": 2.4948067665100098, "learning_rate": 4.9985964542786614e-05, "loss": 2.6724, "step": 275 }, { "epoch": 10.970873786407767, "eval_loss": 2.9974570274353027, "eval_runtime": 0.9771, "eval_samples_per_second": 22.516, "eval_steps_per_second": 6.141, "step": 275 }, { "epoch": 11.03883495145631, "grad_norm": 4.32741117477417, "learning_rate": 4.99847706754774e-05, "loss": 5.4423, "step": 276 }, { "epoch": 11.077669902912621, "grad_norm": 2.3572208881378174, "learning_rate": 4.998352809907928e-05, "loss": 2.6372, "step": 277 }, { "epoch": 11.116504854368932, "grad_norm": 2.4045934677124023, "learning_rate": 4.998223681601473e-05, "loss": 2.6205, "step": 278 }, { "epoch": 11.155339805825243, "grad_norm": 2.5755131244659424, "learning_rate": 4.998089682880117e-05, "loss": 2.5939, "step": 279 }, { "epoch": 11.194174757281553, "grad_norm": 2.5768463611602783, "learning_rate": 4.997950814005098e-05, "loss": 2.6925, "step": 280 }, { "epoch": 11.233009708737864, "grad_norm": 2.5549166202545166, "learning_rate": 4.997807075247146e-05, "loss": 2.6172, "step": 281 }, { "epoch": 11.271844660194175, "grad_norm": 2.761068344116211, "learning_rate": 4.997658466886489e-05, "loss": 2.6572, "step": 282 }, { "epoch": 11.310679611650485, "grad_norm": 2.5051231384277344, "learning_rate": 4.9975049892128455e-05, "loss": 2.6549, "step": 283 }, { "epoch": 11.349514563106796, "grad_norm": 2.7434117794036865, "learning_rate": 4.9973466425254286e-05, "loss": 2.5632, "step": 284 }, { "epoch": 11.388349514563107, "grad_norm": 2.328563928604126, "learning_rate": 4.997183427132943e-05, "loss": 2.5751, "step": 285 }, { "epoch": 11.427184466019417, "grad_norm": 2.7668466567993164, "learning_rate": 4.997015343353585e-05, "loss": 2.6609, "step": 286 }, { "epoch": 11.466019417475728, "grad_norm": 2.0831525325775146, "learning_rate": 4.996842391515044e-05, "loss": 2.6428, "step": 287 }, { "epoch": 11.504854368932039, "grad_norm": 2.4443278312683105, "learning_rate": 4.996664571954497e-05, "loss": 2.6012, "step": 288 }, { "epoch": 11.54368932038835, "grad_norm": 2.4806153774261475, "learning_rate": 4.9964818850186135e-05, "loss": 2.6649, "step": 289 }, { "epoch": 11.58252427184466, "grad_norm": 2.539933919906616, "learning_rate": 4.99629433106355e-05, "loss": 2.6253, "step": 290 }, { "epoch": 11.62135922330097, "grad_norm": 2.7404544353485107, "learning_rate": 4.996101910454953e-05, "loss": 2.6224, "step": 291 }, { "epoch": 11.660194174757281, "grad_norm": 2.5377357006073, "learning_rate": 4.9959046235679565e-05, "loss": 2.6249, "step": 292 }, { "epoch": 11.699029126213592, "grad_norm": 2.8488271236419678, "learning_rate": 4.9957024707871806e-05, "loss": 2.6232, "step": 293 }, { "epoch": 11.737864077669903, "grad_norm": 2.4895827770233154, "learning_rate": 4.9954954525067334e-05, "loss": 2.5983, "step": 294 }, { "epoch": 11.776699029126213, "grad_norm": 3.038975954055786, "learning_rate": 4.995283569130207e-05, "loss": 2.5715, "step": 295 }, { "epoch": 11.815533980582524, "grad_norm": 2.674245595932007, "learning_rate": 4.995066821070679e-05, "loss": 2.6201, "step": 296 }, { "epoch": 11.854368932038835, "grad_norm": 3.5277645587921143, "learning_rate": 4.9948452087507116e-05, "loss": 2.6376, "step": 297 }, { "epoch": 11.893203883495145, "grad_norm": 3.0974984169006348, "learning_rate": 4.994618732602349e-05, "loss": 2.6268, "step": 298 }, { "epoch": 11.932038834951456, "grad_norm": 2.309119462966919, "learning_rate": 4.994387393067117e-05, "loss": 2.5594, "step": 299 }, { "epoch": 11.970873786407767, "grad_norm": 2.540464162826538, "learning_rate": 4.994151190596025e-05, "loss": 2.5765, "step": 300 }, { "epoch": 11.970873786407767, "eval_loss": 2.9208481311798096, "eval_runtime": 1.0115, "eval_samples_per_second": 21.749, "eval_steps_per_second": 5.932, "step": 300 }, { "epoch": 12.03883495145631, "grad_norm": 5.542501449584961, "learning_rate": 4.993910125649561e-05, "loss": 5.1943, "step": 301 }, { "epoch": 12.077669902912621, "grad_norm": 2.2998414039611816, "learning_rate": 4.993664198697694e-05, "loss": 2.5311, "step": 302 }, { "epoch": 12.116504854368932, "grad_norm": 3.0827107429504395, "learning_rate": 4.993413410219871e-05, "loss": 2.5587, "step": 303 }, { "epoch": 12.155339805825243, "grad_norm": 2.7742204666137695, "learning_rate": 4.9931577607050175e-05, "loss": 2.4549, "step": 304 }, { "epoch": 12.194174757281553, "grad_norm": 2.5605695247650146, "learning_rate": 4.992897250651535e-05, "loss": 2.5602, "step": 305 }, { "epoch": 12.233009708737864, "grad_norm": 2.8852667808532715, "learning_rate": 4.992631880567301e-05, "loss": 2.5069, "step": 306 }, { "epoch": 12.271844660194175, "grad_norm": 3.006777048110962, "learning_rate": 4.9923616509696683e-05, "loss": 2.5326, "step": 307 }, { "epoch": 12.310679611650485, "grad_norm": 2.1645665168762207, "learning_rate": 4.9920865623854615e-05, "loss": 2.4739, "step": 308 }, { "epoch": 12.349514563106796, "grad_norm": 2.941042423248291, "learning_rate": 4.9918066153509834e-05, "loss": 2.5149, "step": 309 }, { "epoch": 12.388349514563107, "grad_norm": 2.598097562789917, "learning_rate": 4.991521810412002e-05, "loss": 2.5214, "step": 310 }, { "epoch": 12.427184466019417, "grad_norm": 2.408721446990967, "learning_rate": 4.991232148123761e-05, "loss": 2.4747, "step": 311 }, { "epoch": 12.466019417475728, "grad_norm": 2.39508318901062, "learning_rate": 4.990937629050971e-05, "loss": 2.5304, "step": 312 }, { "epoch": 12.504854368932039, "grad_norm": 2.9436190128326416, "learning_rate": 4.990638253767812e-05, "loss": 2.5046, "step": 313 }, { "epoch": 12.54368932038835, "grad_norm": 2.6037611961364746, "learning_rate": 4.990334022857932e-05, "loss": 2.4537, "step": 314 }, { "epoch": 12.58252427184466, "grad_norm": 2.892789602279663, "learning_rate": 4.9900249369144434e-05, "loss": 2.4817, "step": 315 }, { "epoch": 12.62135922330097, "grad_norm": 2.6804611682891846, "learning_rate": 4.989710996539926e-05, "loss": 2.5012, "step": 316 }, { "epoch": 12.660194174757281, "grad_norm": 2.458824396133423, "learning_rate": 4.9893922023464236e-05, "loss": 2.4661, "step": 317 }, { "epoch": 12.699029126213592, "grad_norm": 2.6641952991485596, "learning_rate": 4.989068554955439e-05, "loss": 2.4971, "step": 318 }, { "epoch": 12.737864077669903, "grad_norm": 2.421142101287842, "learning_rate": 4.988740054997943e-05, "loss": 2.4014, "step": 319 }, { "epoch": 12.776699029126213, "grad_norm": 2.4107542037963867, "learning_rate": 4.98840670311436e-05, "loss": 2.4636, "step": 320 }, { "epoch": 12.815533980582524, "grad_norm": 2.5701303482055664, "learning_rate": 4.988068499954578e-05, "loss": 2.4564, "step": 321 }, { "epoch": 12.854368932038835, "grad_norm": 2.3998067378997803, "learning_rate": 4.987725446177941e-05, "loss": 2.4561, "step": 322 }, { "epoch": 12.893203883495145, "grad_norm": 2.6888773441314697, "learning_rate": 4.987377542453251e-05, "loss": 2.4392, "step": 323 }, { "epoch": 12.932038834951456, "grad_norm": 2.313508987426758, "learning_rate": 4.987024789458762e-05, "loss": 2.4438, "step": 324 }, { "epoch": 12.970873786407767, "grad_norm": 2.5614566802978516, "learning_rate": 4.986667187882186e-05, "loss": 2.557, "step": 325 }, { "epoch": 12.970873786407767, "eval_loss": 2.8833444118499756, "eval_runtime": 1.0479, "eval_samples_per_second": 20.995, "eval_steps_per_second": 5.726, "step": 325 }, { "epoch": 13.03883495145631, "grad_norm": 5.110360145568848, "learning_rate": 4.9863047384206835e-05, "loss": 4.9144, "step": 326 }, { "epoch": 13.077669902912621, "grad_norm": 2.7373085021972656, "learning_rate": 4.98593744178087e-05, "loss": 2.3994, "step": 327 }, { "epoch": 13.116504854368932, "grad_norm": 2.542954206466675, "learning_rate": 4.985565298678809e-05, "loss": 2.3535, "step": 328 }, { "epoch": 13.155339805825243, "grad_norm": 2.6374223232269287, "learning_rate": 4.985188309840012e-05, "loss": 2.3894, "step": 329 }, { "epoch": 13.194174757281553, "grad_norm": 2.541004180908203, "learning_rate": 4.984806475999437e-05, "loss": 2.391, "step": 330 }, { "epoch": 13.233009708737864, "grad_norm": 2.6150271892547607, "learning_rate": 4.984419797901491e-05, "loss": 2.3927, "step": 331 }, { "epoch": 13.271844660194175, "grad_norm": 2.47719144821167, "learning_rate": 4.984028276300021e-05, "loss": 2.3751, "step": 332 }, { "epoch": 13.310679611650485, "grad_norm": 2.679882764816284, "learning_rate": 4.983631911958319e-05, "loss": 2.374, "step": 333 }, { "epoch": 13.349514563106796, "grad_norm": 2.784619092941284, "learning_rate": 4.983230705649118e-05, "loss": 2.3539, "step": 334 }, { "epoch": 13.388349514563107, "grad_norm": 2.188197135925293, "learning_rate": 4.982824658154589e-05, "loss": 2.3553, "step": 335 }, { "epoch": 13.427184466019417, "grad_norm": 2.232978582382202, "learning_rate": 4.982413770266342e-05, "loss": 2.3389, "step": 336 }, { "epoch": 13.466019417475728, "grad_norm": 2.563889980316162, "learning_rate": 4.981998042785427e-05, "loss": 2.3623, "step": 337 }, { "epoch": 13.504854368932039, "grad_norm": 2.9053828716278076, "learning_rate": 4.9815774765223226e-05, "loss": 2.3705, "step": 338 }, { "epoch": 13.54368932038835, "grad_norm": 2.5447866916656494, "learning_rate": 4.9811520722969465e-05, "loss": 2.3216, "step": 339 }, { "epoch": 13.58252427184466, "grad_norm": 3.22255277633667, "learning_rate": 4.9807218309386444e-05, "loss": 2.3418, "step": 340 }, { "epoch": 13.62135922330097, "grad_norm": 3.154477119445801, "learning_rate": 4.980286753286195e-05, "loss": 2.3843, "step": 341 }, { "epoch": 13.660194174757281, "grad_norm": 3.3448827266693115, "learning_rate": 4.979846840187804e-05, "loss": 2.419, "step": 342 }, { "epoch": 13.699029126213592, "grad_norm": 3.275527238845825, "learning_rate": 4.9794020925011044e-05, "loss": 2.3756, "step": 343 }, { "epoch": 13.737864077669903, "grad_norm": 2.3320887088775635, "learning_rate": 4.9789525110931545e-05, "loss": 2.3201, "step": 344 }, { "epoch": 13.776699029126213, "grad_norm": 2.804107427597046, "learning_rate": 4.978498096840436e-05, "loss": 2.3461, "step": 345 }, { "epoch": 13.815533980582524, "grad_norm": 2.809633255004883, "learning_rate": 4.978038850628854e-05, "loss": 2.3418, "step": 346 }, { "epoch": 13.854368932038835, "grad_norm": 2.9983737468719482, "learning_rate": 4.977574773353732e-05, "loss": 2.4238, "step": 347 }, { "epoch": 13.893203883495145, "grad_norm": 2.892005443572998, "learning_rate": 4.977105865919812e-05, "loss": 2.4266, "step": 348 }, { "epoch": 13.932038834951456, "grad_norm": 2.766019821166992, "learning_rate": 4.976632129241252e-05, "loss": 2.3937, "step": 349 }, { "epoch": 13.970873786407767, "grad_norm": 2.5251376628875732, "learning_rate": 4.976153564241628e-05, "loss": 2.3557, "step": 350 }, { "epoch": 13.970873786407767, "eval_loss": 2.855170965194702, "eval_runtime": 1.0307, "eval_samples_per_second": 21.345, "eval_steps_per_second": 5.821, "step": 350 }, { "epoch": 14.03883495145631, "grad_norm": 5.263445854187012, "learning_rate": 4.975670171853926e-05, "loss": 4.6103, "step": 351 }, { "epoch": 14.077669902912621, "grad_norm": 2.6694159507751465, "learning_rate": 4.975181953020544e-05, "loss": 2.2714, "step": 352 }, { "epoch": 14.116504854368932, "grad_norm": 3.4369680881500244, "learning_rate": 4.9746889086932895e-05, "loss": 2.2303, "step": 353 }, { "epoch": 14.155339805825243, "grad_norm": 3.053704023361206, "learning_rate": 4.974191039833378e-05, "loss": 2.2659, "step": 354 }, { "epoch": 14.194174757281553, "grad_norm": 2.9966983795166016, "learning_rate": 4.973688347411431e-05, "loss": 2.3092, "step": 355 }, { "epoch": 14.233009708737864, "grad_norm": 2.965481758117676, "learning_rate": 4.9731808324074717e-05, "loss": 2.2537, "step": 356 }, { "epoch": 14.271844660194175, "grad_norm": 2.9761455059051514, "learning_rate": 4.9726684958109266e-05, "loss": 2.2865, "step": 357 }, { "epoch": 14.310679611650485, "grad_norm": 2.936624050140381, "learning_rate": 4.972151338620623e-05, "loss": 2.2589, "step": 358 }, { "epoch": 14.349514563106796, "grad_norm": 3.4442408084869385, "learning_rate": 4.971629361844785e-05, "loss": 2.2636, "step": 359 }, { "epoch": 14.388349514563107, "grad_norm": 3.0097110271453857, "learning_rate": 4.971102566501034e-05, "loss": 2.204, "step": 360 }, { "epoch": 14.427184466019417, "grad_norm": 3.7276322841644287, "learning_rate": 4.9705709536163824e-05, "loss": 2.2811, "step": 361 }, { "epoch": 14.466019417475728, "grad_norm": 2.8004868030548096, "learning_rate": 4.970034524227238e-05, "loss": 2.1964, "step": 362 }, { "epoch": 14.504854368932039, "grad_norm": 3.1439263820648193, "learning_rate": 4.969493279379398e-05, "loss": 2.294, "step": 363 }, { "epoch": 14.54368932038835, "grad_norm": 2.971735954284668, "learning_rate": 4.968947220128045e-05, "loss": 2.2882, "step": 364 }, { "epoch": 14.58252427184466, "grad_norm": 2.860797166824341, "learning_rate": 4.968396347537751e-05, "loss": 2.1807, "step": 365 }, { "epoch": 14.62135922330097, "grad_norm": 2.8869500160217285, "learning_rate": 4.96784066268247e-05, "loss": 2.267, "step": 366 }, { "epoch": 14.660194174757281, "grad_norm": 3.185670852661133, "learning_rate": 4.967280166645538e-05, "loss": 2.2956, "step": 367 }, { "epoch": 14.699029126213592, "grad_norm": 2.750898838043213, "learning_rate": 4.96671486051967e-05, "loss": 2.2429, "step": 368 }, { "epoch": 14.737864077669903, "grad_norm": 2.690889596939087, "learning_rate": 4.966144745406961e-05, "loss": 2.2645, "step": 369 }, { "epoch": 14.776699029126213, "grad_norm": 2.5257797241210938, "learning_rate": 4.965569822418877e-05, "loss": 2.1714, "step": 370 }, { "epoch": 14.815533980582524, "grad_norm": 2.550966739654541, "learning_rate": 4.964990092676263e-05, "loss": 2.2281, "step": 371 }, { "epoch": 14.854368932038835, "grad_norm": 2.6299831867218018, "learning_rate": 4.964405557309328e-05, "loss": 2.2925, "step": 372 }, { "epoch": 14.893203883495145, "grad_norm": 2.8115315437316895, "learning_rate": 4.963816217457657e-05, "loss": 2.3404, "step": 373 }, { "epoch": 14.932038834951456, "grad_norm": 2.646278142929077, "learning_rate": 4.9632220742701965e-05, "loss": 2.2326, "step": 374 }, { "epoch": 14.970873786407767, "grad_norm": 2.667069435119629, "learning_rate": 4.9626231289052596e-05, "loss": 2.318, "step": 375 }, { "epoch": 14.970873786407767, "eval_loss": 2.8448235988616943, "eval_runtime": 0.9921, "eval_samples_per_second": 22.176, "eval_steps_per_second": 6.048, "step": 375 } ], "logging_steps": 1, "max_steps": 2500, "num_input_tokens_seen": 0, "num_train_epochs": 100, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3395309036544000.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }