| { |
| "best_global_step": 375, |
| "best_metric": 2.8448235988616943, |
| "best_model_checkpoint": "outputs/checkpoint-375", |
| "epoch": 14.970873786407767, |
| "eval_steps": 500, |
| "global_step": 375, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.038834951456310676, |
| "grad_norm": 21.30242156982422, |
| "learning_rate": 0.0, |
| "loss": 6.5474, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.07766990291262135, |
| "grad_norm": 20.775470733642578, |
| "learning_rate": 2.0000000000000002e-07, |
| "loss": 6.5613, |
| "step": 2 |
| }, |
| { |
| "epoch": 0.11650485436893204, |
| "grad_norm": 20.96541976928711, |
| "learning_rate": 4.0000000000000003e-07, |
| "loss": 6.5127, |
| "step": 3 |
| }, |
| { |
| "epoch": 0.1553398058252427, |
| "grad_norm": 20.376543045043945, |
| "learning_rate": 6.000000000000001e-07, |
| "loss": 6.4569, |
| "step": 4 |
| }, |
| { |
| "epoch": 0.1941747572815534, |
| "grad_norm": 19.54267692565918, |
| "learning_rate": 8.000000000000001e-07, |
| "loss": 6.3743, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.23300970873786409, |
| "grad_norm": 19.233882904052734, |
| "learning_rate": 1.0000000000000002e-06, |
| "loss": 6.3899, |
| "step": 6 |
| }, |
| { |
| "epoch": 0.27184466019417475, |
| "grad_norm": 20.25909423828125, |
| "learning_rate": 1.2000000000000002e-06, |
| "loss": 6.4415, |
| "step": 7 |
| }, |
| { |
| "epoch": 0.3106796116504854, |
| "grad_norm": 19.33000373840332, |
| "learning_rate": 1.4000000000000001e-06, |
| "loss": 6.3191, |
| "step": 8 |
| }, |
| { |
| "epoch": 0.34951456310679613, |
| "grad_norm": 18.305322647094727, |
| "learning_rate": 1.6000000000000001e-06, |
| "loss": 6.2681, |
| "step": 9 |
| }, |
| { |
| "epoch": 0.3883495145631068, |
| "grad_norm": 17.74665069580078, |
| "learning_rate": 1.8e-06, |
| "loss": 6.4206, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.42718446601941745, |
| "grad_norm": 14.93736457824707, |
| "learning_rate": 2.0000000000000003e-06, |
| "loss": 6.2288, |
| "step": 11 |
| }, |
| { |
| "epoch": 0.46601941747572817, |
| "grad_norm": 14.914277076721191, |
| "learning_rate": 2.2e-06, |
| "loss": 6.3961, |
| "step": 12 |
| }, |
| { |
| "epoch": 0.5048543689320388, |
| "grad_norm": 13.266161918640137, |
| "learning_rate": 2.4000000000000003e-06, |
| "loss": 6.0076, |
| "step": 13 |
| }, |
| { |
| "epoch": 0.5436893203883495, |
| "grad_norm": 12.377790451049805, |
| "learning_rate": 2.6e-06, |
| "loss": 6.2259, |
| "step": 14 |
| }, |
| { |
| "epoch": 0.5825242718446602, |
| "grad_norm": 11.322343826293945, |
| "learning_rate": 2.8000000000000003e-06, |
| "loss": 6.1832, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.6213592233009708, |
| "grad_norm": 10.584484100341797, |
| "learning_rate": 3e-06, |
| "loss": 6.0051, |
| "step": 16 |
| }, |
| { |
| "epoch": 0.6601941747572816, |
| "grad_norm": 10.82979965209961, |
| "learning_rate": 3.2000000000000003e-06, |
| "loss": 5.9786, |
| "step": 17 |
| }, |
| { |
| "epoch": 0.6990291262135923, |
| "grad_norm": 10.112428665161133, |
| "learning_rate": 3.4000000000000005e-06, |
| "loss": 5.9778, |
| "step": 18 |
| }, |
| { |
| "epoch": 0.7378640776699029, |
| "grad_norm": 9.44952392578125, |
| "learning_rate": 3.6e-06, |
| "loss": 5.9459, |
| "step": 19 |
| }, |
| { |
| "epoch": 0.7766990291262136, |
| "grad_norm": 9.057659149169922, |
| "learning_rate": 3.8e-06, |
| "loss": 6.0317, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.8155339805825242, |
| "grad_norm": 9.000926971435547, |
| "learning_rate": 4.000000000000001e-06, |
| "loss": 5.9749, |
| "step": 21 |
| }, |
| { |
| "epoch": 0.8543689320388349, |
| "grad_norm": 7.747213840484619, |
| "learning_rate": 4.2000000000000004e-06, |
| "loss": 5.8036, |
| "step": 22 |
| }, |
| { |
| "epoch": 0.8932038834951457, |
| "grad_norm": 6.968072891235352, |
| "learning_rate": 4.4e-06, |
| "loss": 5.7705, |
| "step": 23 |
| }, |
| { |
| "epoch": 0.9320388349514563, |
| "grad_norm": 7.167684555053711, |
| "learning_rate": 4.6e-06, |
| "loss": 5.7804, |
| "step": 24 |
| }, |
| { |
| "epoch": 0.970873786407767, |
| "grad_norm": 6.384294033050537, |
| "learning_rate": 4.800000000000001e-06, |
| "loss": 5.6137, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.970873786407767, |
| "eval_loss": 5.6718363761901855, |
| "eval_runtime": 2.435, |
| "eval_samples_per_second": 9.035, |
| "eval_steps_per_second": 2.464, |
| "step": 25 |
| }, |
| { |
| "epoch": 1.0388349514563107, |
| "grad_norm": 12.656047821044922, |
| "learning_rate": 5e-06, |
| "loss": 11.4741, |
| "step": 26 |
| }, |
| { |
| "epoch": 1.0776699029126213, |
| "grad_norm": 6.408062934875488, |
| "learning_rate": 5.2e-06, |
| "loss": 5.7835, |
| "step": 27 |
| }, |
| { |
| "epoch": 1.116504854368932, |
| "grad_norm": 6.457642078399658, |
| "learning_rate": 5.4e-06, |
| "loss": 5.7659, |
| "step": 28 |
| }, |
| { |
| "epoch": 1.1553398058252426, |
| "grad_norm": 6.716769218444824, |
| "learning_rate": 5.600000000000001e-06, |
| "loss": 5.6133, |
| "step": 29 |
| }, |
| { |
| "epoch": 1.1941747572815533, |
| "grad_norm": 5.562079906463623, |
| "learning_rate": 5.8e-06, |
| "loss": 5.6588, |
| "step": 30 |
| }, |
| { |
| "epoch": 1.233009708737864, |
| "grad_norm": 5.209117412567139, |
| "learning_rate": 6e-06, |
| "loss": 5.6118, |
| "step": 31 |
| }, |
| { |
| "epoch": 1.2718446601941746, |
| "grad_norm": 5.505391597747803, |
| "learning_rate": 6.2e-06, |
| "loss": 5.6468, |
| "step": 32 |
| }, |
| { |
| "epoch": 1.3106796116504853, |
| "grad_norm": 4.989831924438477, |
| "learning_rate": 6.4000000000000006e-06, |
| "loss": 5.5483, |
| "step": 33 |
| }, |
| { |
| "epoch": 1.3495145631067962, |
| "grad_norm": 5.000854015350342, |
| "learning_rate": 6.6e-06, |
| "loss": 5.4522, |
| "step": 34 |
| }, |
| { |
| "epoch": 1.3883495145631068, |
| "grad_norm": 4.343570232391357, |
| "learning_rate": 6.800000000000001e-06, |
| "loss": 5.3562, |
| "step": 35 |
| }, |
| { |
| "epoch": 1.4271844660194175, |
| "grad_norm": 4.40326452255249, |
| "learning_rate": 7.000000000000001e-06, |
| "loss": 5.4561, |
| "step": 36 |
| }, |
| { |
| "epoch": 1.4660194174757282, |
| "grad_norm": 4.1591901779174805, |
| "learning_rate": 7.2e-06, |
| "loss": 5.3806, |
| "step": 37 |
| }, |
| { |
| "epoch": 1.5048543689320388, |
| "grad_norm": 4.1347246170043945, |
| "learning_rate": 7.4e-06, |
| "loss": 5.419, |
| "step": 38 |
| }, |
| { |
| "epoch": 1.5436893203883495, |
| "grad_norm": 4.123111248016357, |
| "learning_rate": 7.6e-06, |
| "loss": 5.2831, |
| "step": 39 |
| }, |
| { |
| "epoch": 1.5825242718446602, |
| "grad_norm": 4.009028911590576, |
| "learning_rate": 7.8e-06, |
| "loss": 5.332, |
| "step": 40 |
| }, |
| { |
| "epoch": 1.6213592233009708, |
| "grad_norm": 4.013438701629639, |
| "learning_rate": 8.000000000000001e-06, |
| "loss": 5.2177, |
| "step": 41 |
| }, |
| { |
| "epoch": 1.6601941747572817, |
| "grad_norm": 3.698003053665161, |
| "learning_rate": 8.200000000000001e-06, |
| "loss": 5.3019, |
| "step": 42 |
| }, |
| { |
| "epoch": 1.6990291262135924, |
| "grad_norm": 3.66217041015625, |
| "learning_rate": 8.400000000000001e-06, |
| "loss": 5.1967, |
| "step": 43 |
| }, |
| { |
| "epoch": 1.737864077669903, |
| "grad_norm": 3.455019235610962, |
| "learning_rate": 8.599999999999999e-06, |
| "loss": 5.1573, |
| "step": 44 |
| }, |
| { |
| "epoch": 1.7766990291262137, |
| "grad_norm": 3.5593278408050537, |
| "learning_rate": 8.8e-06, |
| "loss": 5.1463, |
| "step": 45 |
| }, |
| { |
| "epoch": 1.8155339805825244, |
| "grad_norm": 3.332477331161499, |
| "learning_rate": 9e-06, |
| "loss": 5.1732, |
| "step": 46 |
| }, |
| { |
| "epoch": 1.854368932038835, |
| "grad_norm": 3.2428054809570312, |
| "learning_rate": 9.2e-06, |
| "loss": 5.0962, |
| "step": 47 |
| }, |
| { |
| "epoch": 1.8932038834951457, |
| "grad_norm": 3.339063882827759, |
| "learning_rate": 9.4e-06, |
| "loss": 5.0253, |
| "step": 48 |
| }, |
| { |
| "epoch": 1.9320388349514563, |
| "grad_norm": 3.4746124744415283, |
| "learning_rate": 9.600000000000001e-06, |
| "loss": 5.1363, |
| "step": 49 |
| }, |
| { |
| "epoch": 1.970873786407767, |
| "grad_norm": 3.371466875076294, |
| "learning_rate": 9.800000000000001e-06, |
| "loss": 5.1445, |
| "step": 50 |
| }, |
| { |
| "epoch": 1.970873786407767, |
| "eval_loss": 5.052736282348633, |
| "eval_runtime": 0.9737, |
| "eval_samples_per_second": 22.593, |
| "eval_steps_per_second": 6.162, |
| "step": 50 |
| }, |
| { |
| "epoch": 2.0388349514563107, |
| "grad_norm": 5.6498637199401855, |
| "learning_rate": 1e-05, |
| "loss": 10.112, |
| "step": 51 |
| }, |
| { |
| "epoch": 2.0776699029126213, |
| "grad_norm": 3.1301138401031494, |
| "learning_rate": 1.02e-05, |
| "loss": 5.1063, |
| "step": 52 |
| }, |
| { |
| "epoch": 2.116504854368932, |
| "grad_norm": 3.452958345413208, |
| "learning_rate": 1.04e-05, |
| "loss": 5.0082, |
| "step": 53 |
| }, |
| { |
| "epoch": 2.1553398058252426, |
| "grad_norm": 3.1977169513702393, |
| "learning_rate": 1.06e-05, |
| "loss": 4.9698, |
| "step": 54 |
| }, |
| { |
| "epoch": 2.1941747572815533, |
| "grad_norm": 2.6776535511016846, |
| "learning_rate": 1.08e-05, |
| "loss": 4.9449, |
| "step": 55 |
| }, |
| { |
| "epoch": 2.233009708737864, |
| "grad_norm": 3.5574913024902344, |
| "learning_rate": 1.1000000000000001e-05, |
| "loss": 5.0442, |
| "step": 56 |
| }, |
| { |
| "epoch": 2.2718446601941746, |
| "grad_norm": 2.867915391921997, |
| "learning_rate": 1.1200000000000001e-05, |
| "loss": 4.8769, |
| "step": 57 |
| }, |
| { |
| "epoch": 2.3106796116504853, |
| "grad_norm": 2.764223098754883, |
| "learning_rate": 1.1400000000000001e-05, |
| "loss": 4.9286, |
| "step": 58 |
| }, |
| { |
| "epoch": 2.349514563106796, |
| "grad_norm": 3.816723585128784, |
| "learning_rate": 1.16e-05, |
| "loss": 4.8921, |
| "step": 59 |
| }, |
| { |
| "epoch": 2.3883495145631066, |
| "grad_norm": 3.161980152130127, |
| "learning_rate": 1.18e-05, |
| "loss": 4.916, |
| "step": 60 |
| }, |
| { |
| "epoch": 2.4271844660194173, |
| "grad_norm": 2.8373942375183105, |
| "learning_rate": 1.2e-05, |
| "loss": 4.8942, |
| "step": 61 |
| }, |
| { |
| "epoch": 2.466019417475728, |
| "grad_norm": 2.8898000717163086, |
| "learning_rate": 1.22e-05, |
| "loss": 4.8206, |
| "step": 62 |
| }, |
| { |
| "epoch": 2.5048543689320386, |
| "grad_norm": 2.726362943649292, |
| "learning_rate": 1.24e-05, |
| "loss": 4.846, |
| "step": 63 |
| }, |
| { |
| "epoch": 2.5436893203883493, |
| "grad_norm": 2.73665714263916, |
| "learning_rate": 1.2600000000000001e-05, |
| "loss": 4.8375, |
| "step": 64 |
| }, |
| { |
| "epoch": 2.58252427184466, |
| "grad_norm": 3.1228106021881104, |
| "learning_rate": 1.2800000000000001e-05, |
| "loss": 4.7526, |
| "step": 65 |
| }, |
| { |
| "epoch": 2.6213592233009706, |
| "grad_norm": 2.9702351093292236, |
| "learning_rate": 1.3000000000000001e-05, |
| "loss": 4.8024, |
| "step": 66 |
| }, |
| { |
| "epoch": 2.6601941747572817, |
| "grad_norm": 3.0533952713012695, |
| "learning_rate": 1.32e-05, |
| "loss": 4.7883, |
| "step": 67 |
| }, |
| { |
| "epoch": 2.6990291262135924, |
| "grad_norm": 3.1949095726013184, |
| "learning_rate": 1.3400000000000002e-05, |
| "loss": 4.8197, |
| "step": 68 |
| }, |
| { |
| "epoch": 2.737864077669903, |
| "grad_norm": 3.399998426437378, |
| "learning_rate": 1.3600000000000002e-05, |
| "loss": 4.6677, |
| "step": 69 |
| }, |
| { |
| "epoch": 2.7766990291262137, |
| "grad_norm": 2.80118465423584, |
| "learning_rate": 1.3800000000000002e-05, |
| "loss": 4.6291, |
| "step": 70 |
| }, |
| { |
| "epoch": 2.8155339805825244, |
| "grad_norm": 2.8477330207824707, |
| "learning_rate": 1.4000000000000001e-05, |
| "loss": 4.7767, |
| "step": 71 |
| }, |
| { |
| "epoch": 2.854368932038835, |
| "grad_norm": 2.6895911693573, |
| "learning_rate": 1.42e-05, |
| "loss": 4.7057, |
| "step": 72 |
| }, |
| { |
| "epoch": 2.8932038834951457, |
| "grad_norm": 2.914586067199707, |
| "learning_rate": 1.44e-05, |
| "loss": 4.6386, |
| "step": 73 |
| }, |
| { |
| "epoch": 2.9320388349514563, |
| "grad_norm": 2.6184370517730713, |
| "learning_rate": 1.4599999999999999e-05, |
| "loss": 4.6679, |
| "step": 74 |
| }, |
| { |
| "epoch": 2.970873786407767, |
| "grad_norm": 3.00891375541687, |
| "learning_rate": 1.48e-05, |
| "loss": 4.6319, |
| "step": 75 |
| }, |
| { |
| "epoch": 2.970873786407767, |
| "eval_loss": 4.614713668823242, |
| "eval_runtime": 0.9702, |
| "eval_samples_per_second": 22.675, |
| "eval_steps_per_second": 6.184, |
| "step": 75 |
| }, |
| { |
| "epoch": 3.0388349514563107, |
| "grad_norm": 5.222214221954346, |
| "learning_rate": 1.5e-05, |
| "loss": 9.2212, |
| "step": 76 |
| }, |
| { |
| "epoch": 3.0776699029126213, |
| "grad_norm": 2.716062307357788, |
| "learning_rate": 1.52e-05, |
| "loss": 4.6294, |
| "step": 77 |
| }, |
| { |
| "epoch": 3.116504854368932, |
| "grad_norm": 2.503143548965454, |
| "learning_rate": 1.54e-05, |
| "loss": 4.5572, |
| "step": 78 |
| }, |
| { |
| "epoch": 3.1553398058252426, |
| "grad_norm": 2.9183573722839355, |
| "learning_rate": 1.56e-05, |
| "loss": 4.453, |
| "step": 79 |
| }, |
| { |
| "epoch": 3.1941747572815533, |
| "grad_norm": 2.7854349613189697, |
| "learning_rate": 1.58e-05, |
| "loss": 4.5746, |
| "step": 80 |
| }, |
| { |
| "epoch": 3.233009708737864, |
| "grad_norm": 2.8391106128692627, |
| "learning_rate": 1.6000000000000003e-05, |
| "loss": 4.5228, |
| "step": 81 |
| }, |
| { |
| "epoch": 3.2718446601941746, |
| "grad_norm": 2.5229265689849854, |
| "learning_rate": 1.62e-05, |
| "loss": 4.4692, |
| "step": 82 |
| }, |
| { |
| "epoch": 3.3106796116504853, |
| "grad_norm": 2.643170118331909, |
| "learning_rate": 1.6400000000000002e-05, |
| "loss": 4.498, |
| "step": 83 |
| }, |
| { |
| "epoch": 3.349514563106796, |
| "grad_norm": 2.542393922805786, |
| "learning_rate": 1.66e-05, |
| "loss": 4.4816, |
| "step": 84 |
| }, |
| { |
| "epoch": 3.3883495145631066, |
| "grad_norm": 2.563282012939453, |
| "learning_rate": 1.6800000000000002e-05, |
| "loss": 4.4824, |
| "step": 85 |
| }, |
| { |
| "epoch": 3.4271844660194173, |
| "grad_norm": 2.698516368865967, |
| "learning_rate": 1.7000000000000003e-05, |
| "loss": 4.4717, |
| "step": 86 |
| }, |
| { |
| "epoch": 3.466019417475728, |
| "grad_norm": 2.936776638031006, |
| "learning_rate": 1.7199999999999998e-05, |
| "loss": 4.346, |
| "step": 87 |
| }, |
| { |
| "epoch": 3.5048543689320386, |
| "grad_norm": 2.9594175815582275, |
| "learning_rate": 1.74e-05, |
| "loss": 4.3689, |
| "step": 88 |
| }, |
| { |
| "epoch": 3.5436893203883493, |
| "grad_norm": 3.02431583404541, |
| "learning_rate": 1.76e-05, |
| "loss": 4.3922, |
| "step": 89 |
| }, |
| { |
| "epoch": 3.58252427184466, |
| "grad_norm": 3.238933563232422, |
| "learning_rate": 1.78e-05, |
| "loss": 4.4046, |
| "step": 90 |
| }, |
| { |
| "epoch": 3.6213592233009706, |
| "grad_norm": 3.368084192276001, |
| "learning_rate": 1.8e-05, |
| "loss": 4.3768, |
| "step": 91 |
| }, |
| { |
| "epoch": 3.6601941747572817, |
| "grad_norm": 3.8072586059570312, |
| "learning_rate": 1.8200000000000002e-05, |
| "loss": 4.3188, |
| "step": 92 |
| }, |
| { |
| "epoch": 3.6990291262135924, |
| "grad_norm": 3.2370452880859375, |
| "learning_rate": 1.84e-05, |
| "loss": 4.3368, |
| "step": 93 |
| }, |
| { |
| "epoch": 3.737864077669903, |
| "grad_norm": 3.302961826324463, |
| "learning_rate": 1.86e-05, |
| "loss": 4.3339, |
| "step": 94 |
| }, |
| { |
| "epoch": 3.7766990291262137, |
| "grad_norm": 3.5947256088256836, |
| "learning_rate": 1.88e-05, |
| "loss": 4.2763, |
| "step": 95 |
| }, |
| { |
| "epoch": 3.8155339805825244, |
| "grad_norm": 2.955308437347412, |
| "learning_rate": 1.9e-05, |
| "loss": 4.3941, |
| "step": 96 |
| }, |
| { |
| "epoch": 3.854368932038835, |
| "grad_norm": 3.303628444671631, |
| "learning_rate": 1.9200000000000003e-05, |
| "loss": 4.2748, |
| "step": 97 |
| }, |
| { |
| "epoch": 3.8932038834951457, |
| "grad_norm": 2.7507269382476807, |
| "learning_rate": 1.94e-05, |
| "loss": 4.2881, |
| "step": 98 |
| }, |
| { |
| "epoch": 3.9320388349514563, |
| "grad_norm": 2.6451849937438965, |
| "learning_rate": 1.9600000000000002e-05, |
| "loss": 4.3818, |
| "step": 99 |
| }, |
| { |
| "epoch": 3.970873786407767, |
| "grad_norm": 4.112302780151367, |
| "learning_rate": 1.9800000000000004e-05, |
| "loss": 4.2882, |
| "step": 100 |
| }, |
| { |
| "epoch": 3.970873786407767, |
| "eval_loss": 4.266085624694824, |
| "eval_runtime": 1.036, |
| "eval_samples_per_second": 21.235, |
| "eval_steps_per_second": 5.791, |
| "step": 100 |
| }, |
| { |
| "epoch": 4.038834951456311, |
| "grad_norm": 5.2990498542785645, |
| "learning_rate": 2e-05, |
| "loss": 8.631, |
| "step": 101 |
| }, |
| { |
| "epoch": 4.077669902912621, |
| "grad_norm": 3.757814407348633, |
| "learning_rate": 2.0200000000000003e-05, |
| "loss": 4.2183, |
| "step": 102 |
| }, |
| { |
| "epoch": 4.116504854368932, |
| "grad_norm": 2.905704975128174, |
| "learning_rate": 2.04e-05, |
| "loss": 4.1782, |
| "step": 103 |
| }, |
| { |
| "epoch": 4.155339805825243, |
| "grad_norm": 3.7264492511749268, |
| "learning_rate": 2.06e-05, |
| "loss": 4.2959, |
| "step": 104 |
| }, |
| { |
| "epoch": 4.194174757281553, |
| "grad_norm": 3.9989054203033447, |
| "learning_rate": 2.08e-05, |
| "loss": 4.1876, |
| "step": 105 |
| }, |
| { |
| "epoch": 4.233009708737864, |
| "grad_norm": 2.978239059448242, |
| "learning_rate": 2.1e-05, |
| "loss": 4.1484, |
| "step": 106 |
| }, |
| { |
| "epoch": 4.271844660194175, |
| "grad_norm": 3.223487138748169, |
| "learning_rate": 2.12e-05, |
| "loss": 4.1501, |
| "step": 107 |
| }, |
| { |
| "epoch": 4.310679611650485, |
| "grad_norm": 3.035008668899536, |
| "learning_rate": 2.1400000000000002e-05, |
| "loss": 4.1316, |
| "step": 108 |
| }, |
| { |
| "epoch": 4.349514563106796, |
| "grad_norm": 2.878307819366455, |
| "learning_rate": 2.16e-05, |
| "loss": 4.1824, |
| "step": 109 |
| }, |
| { |
| "epoch": 4.388349514563107, |
| "grad_norm": 3.095815420150757, |
| "learning_rate": 2.18e-05, |
| "loss": 4.1726, |
| "step": 110 |
| }, |
| { |
| "epoch": 4.427184466019417, |
| "grad_norm": 3.0754470825195312, |
| "learning_rate": 2.2000000000000003e-05, |
| "loss": 3.9618, |
| "step": 111 |
| }, |
| { |
| "epoch": 4.466019417475728, |
| "grad_norm": 3.4234559535980225, |
| "learning_rate": 2.22e-05, |
| "loss": 4.0646, |
| "step": 112 |
| }, |
| { |
| "epoch": 4.504854368932039, |
| "grad_norm": 3.2128183841705322, |
| "learning_rate": 2.2400000000000002e-05, |
| "loss": 4.0639, |
| "step": 113 |
| }, |
| { |
| "epoch": 4.543689320388349, |
| "grad_norm": 2.9789934158325195, |
| "learning_rate": 2.26e-05, |
| "loss": 4.1373, |
| "step": 114 |
| }, |
| { |
| "epoch": 4.58252427184466, |
| "grad_norm": 2.5928032398223877, |
| "learning_rate": 2.2800000000000002e-05, |
| "loss": 3.9855, |
| "step": 115 |
| }, |
| { |
| "epoch": 4.621359223300971, |
| "grad_norm": 3.082489252090454, |
| "learning_rate": 2.3000000000000003e-05, |
| "loss": 4.1163, |
| "step": 116 |
| }, |
| { |
| "epoch": 4.660194174757281, |
| "grad_norm": 3.028413772583008, |
| "learning_rate": 2.32e-05, |
| "loss": 4.0571, |
| "step": 117 |
| }, |
| { |
| "epoch": 4.699029126213592, |
| "grad_norm": 2.8744428157806396, |
| "learning_rate": 2.3400000000000003e-05, |
| "loss": 4.027, |
| "step": 118 |
| }, |
| { |
| "epoch": 4.737864077669903, |
| "grad_norm": 2.866056442260742, |
| "learning_rate": 2.36e-05, |
| "loss": 4.0299, |
| "step": 119 |
| }, |
| { |
| "epoch": 4.776699029126213, |
| "grad_norm": 2.75072979927063, |
| "learning_rate": 2.38e-05, |
| "loss": 3.993, |
| "step": 120 |
| }, |
| { |
| "epoch": 4.815533980582524, |
| "grad_norm": 2.8751604557037354, |
| "learning_rate": 2.4e-05, |
| "loss": 3.9961, |
| "step": 121 |
| }, |
| { |
| "epoch": 4.854368932038835, |
| "grad_norm": 2.5905075073242188, |
| "learning_rate": 2.4200000000000002e-05, |
| "loss": 3.9582, |
| "step": 122 |
| }, |
| { |
| "epoch": 4.893203883495145, |
| "grad_norm": 3.143044948577881, |
| "learning_rate": 2.44e-05, |
| "loss": 3.9464, |
| "step": 123 |
| }, |
| { |
| "epoch": 4.932038834951456, |
| "grad_norm": 2.6397016048431396, |
| "learning_rate": 2.46e-05, |
| "loss": 4.0075, |
| "step": 124 |
| }, |
| { |
| "epoch": 4.970873786407767, |
| "grad_norm": 3.2383229732513428, |
| "learning_rate": 2.48e-05, |
| "loss": 3.9822, |
| "step": 125 |
| }, |
| { |
| "epoch": 4.970873786407767, |
| "eval_loss": 3.980665445327759, |
| "eval_runtime": 1.0248, |
| "eval_samples_per_second": 21.467, |
| "eval_steps_per_second": 5.855, |
| "step": 125 |
| }, |
| { |
| "epoch": 5.038834951456311, |
| "grad_norm": 5.962584495544434, |
| "learning_rate": 2.5e-05, |
| "loss": 7.7604, |
| "step": 126 |
| }, |
| { |
| "epoch": 5.077669902912621, |
| "grad_norm": 3.243708610534668, |
| "learning_rate": 2.5200000000000003e-05, |
| "loss": 3.9438, |
| "step": 127 |
| }, |
| { |
| "epoch": 5.116504854368932, |
| "grad_norm": 2.763148307800293, |
| "learning_rate": 2.54e-05, |
| "loss": 3.8661, |
| "step": 128 |
| }, |
| { |
| "epoch": 5.155339805825243, |
| "grad_norm": 2.6233339309692383, |
| "learning_rate": 2.5600000000000002e-05, |
| "loss": 3.9006, |
| "step": 129 |
| }, |
| { |
| "epoch": 5.194174757281553, |
| "grad_norm": 3.1037437915802, |
| "learning_rate": 2.58e-05, |
| "loss": 3.9066, |
| "step": 130 |
| }, |
| { |
| "epoch": 5.233009708737864, |
| "grad_norm": 3.3434383869171143, |
| "learning_rate": 2.6000000000000002e-05, |
| "loss": 3.8425, |
| "step": 131 |
| }, |
| { |
| "epoch": 5.271844660194175, |
| "grad_norm": 3.0016958713531494, |
| "learning_rate": 2.6200000000000003e-05, |
| "loss": 3.8723, |
| "step": 132 |
| }, |
| { |
| "epoch": 5.310679611650485, |
| "grad_norm": 3.2040951251983643, |
| "learning_rate": 2.64e-05, |
| "loss": 3.8326, |
| "step": 133 |
| }, |
| { |
| "epoch": 5.349514563106796, |
| "grad_norm": 3.892890453338623, |
| "learning_rate": 2.6600000000000003e-05, |
| "loss": 3.9277, |
| "step": 134 |
| }, |
| { |
| "epoch": 5.388349514563107, |
| "grad_norm": 3.3505635261535645, |
| "learning_rate": 2.6800000000000004e-05, |
| "loss": 3.7381, |
| "step": 135 |
| }, |
| { |
| "epoch": 5.427184466019417, |
| "grad_norm": 3.60493803024292, |
| "learning_rate": 2.7000000000000002e-05, |
| "loss": 3.9003, |
| "step": 136 |
| }, |
| { |
| "epoch": 5.466019417475728, |
| "grad_norm": 3.3468196392059326, |
| "learning_rate": 2.7200000000000004e-05, |
| "loss": 3.829, |
| "step": 137 |
| }, |
| { |
| "epoch": 5.504854368932039, |
| "grad_norm": 2.7208919525146484, |
| "learning_rate": 2.7400000000000002e-05, |
| "loss": 3.7987, |
| "step": 138 |
| }, |
| { |
| "epoch": 5.543689320388349, |
| "grad_norm": 4.0348920822143555, |
| "learning_rate": 2.7600000000000003e-05, |
| "loss": 3.8318, |
| "step": 139 |
| }, |
| { |
| "epoch": 5.58252427184466, |
| "grad_norm": 3.560403347015381, |
| "learning_rate": 2.7800000000000005e-05, |
| "loss": 3.763, |
| "step": 140 |
| }, |
| { |
| "epoch": 5.621359223300971, |
| "grad_norm": 3.262423515319824, |
| "learning_rate": 2.8000000000000003e-05, |
| "loss": 3.7441, |
| "step": 141 |
| }, |
| { |
| "epoch": 5.660194174757281, |
| "grad_norm": 2.7930023670196533, |
| "learning_rate": 2.8199999999999998e-05, |
| "loss": 3.7323, |
| "step": 142 |
| }, |
| { |
| "epoch": 5.699029126213592, |
| "grad_norm": 2.5322391986846924, |
| "learning_rate": 2.84e-05, |
| "loss": 3.6681, |
| "step": 143 |
| }, |
| { |
| "epoch": 5.737864077669903, |
| "grad_norm": 4.258012294769287, |
| "learning_rate": 2.86e-05, |
| "loss": 3.7049, |
| "step": 144 |
| }, |
| { |
| "epoch": 5.776699029126213, |
| "grad_norm": 3.0756101608276367, |
| "learning_rate": 2.88e-05, |
| "loss": 3.7184, |
| "step": 145 |
| }, |
| { |
| "epoch": 5.815533980582524, |
| "grad_norm": 3.0040361881256104, |
| "learning_rate": 2.9e-05, |
| "loss": 3.6077, |
| "step": 146 |
| }, |
| { |
| "epoch": 5.854368932038835, |
| "grad_norm": 4.292761325836182, |
| "learning_rate": 2.9199999999999998e-05, |
| "loss": 3.7214, |
| "step": 147 |
| }, |
| { |
| "epoch": 5.893203883495145, |
| "grad_norm": 2.876159906387329, |
| "learning_rate": 2.94e-05, |
| "loss": 3.6643, |
| "step": 148 |
| }, |
| { |
| "epoch": 5.932038834951456, |
| "grad_norm": 3.1686434745788574, |
| "learning_rate": 2.96e-05, |
| "loss": 3.68, |
| "step": 149 |
| }, |
| { |
| "epoch": 5.970873786407767, |
| "grad_norm": 3.1515626907348633, |
| "learning_rate": 2.98e-05, |
| "loss": 3.6581, |
| "step": 150 |
| }, |
| { |
| "epoch": 5.970873786407767, |
| "eval_loss": 3.7385447025299072, |
| "eval_runtime": 1.0393, |
| "eval_samples_per_second": 21.169, |
| "eval_steps_per_second": 5.773, |
| "step": 150 |
| }, |
| { |
| "epoch": 6.038834951456311, |
| "grad_norm": 6.013641834259033, |
| "learning_rate": 3e-05, |
| "loss": 7.2601, |
| "step": 151 |
| }, |
| { |
| "epoch": 6.077669902912621, |
| "grad_norm": 3.0433292388916016, |
| "learning_rate": 3.02e-05, |
| "loss": 3.626, |
| "step": 152 |
| }, |
| { |
| "epoch": 6.116504854368932, |
| "grad_norm": 2.9623515605926514, |
| "learning_rate": 3.04e-05, |
| "loss": 3.5856, |
| "step": 153 |
| }, |
| { |
| "epoch": 6.155339805825243, |
| "grad_norm": 3.333615779876709, |
| "learning_rate": 3.06e-05, |
| "loss": 3.6268, |
| "step": 154 |
| }, |
| { |
| "epoch": 6.194174757281553, |
| "grad_norm": 3.0843307971954346, |
| "learning_rate": 3.08e-05, |
| "loss": 3.5651, |
| "step": 155 |
| }, |
| { |
| "epoch": 6.233009708737864, |
| "grad_norm": 2.859063148498535, |
| "learning_rate": 3.1e-05, |
| "loss": 3.5464, |
| "step": 156 |
| }, |
| { |
| "epoch": 6.271844660194175, |
| "grad_norm": 2.92948842048645, |
| "learning_rate": 3.12e-05, |
| "loss": 3.6385, |
| "step": 157 |
| }, |
| { |
| "epoch": 6.310679611650485, |
| "grad_norm": 3.552112340927124, |
| "learning_rate": 3.1400000000000004e-05, |
| "loss": 3.5479, |
| "step": 158 |
| }, |
| { |
| "epoch": 6.349514563106796, |
| "grad_norm": 2.9934771060943604, |
| "learning_rate": 3.16e-05, |
| "loss": 3.5697, |
| "step": 159 |
| }, |
| { |
| "epoch": 6.388349514563107, |
| "grad_norm": 2.595054864883423, |
| "learning_rate": 3.18e-05, |
| "loss": 3.4817, |
| "step": 160 |
| }, |
| { |
| "epoch": 6.427184466019417, |
| "grad_norm": 3.077573537826538, |
| "learning_rate": 3.2000000000000005e-05, |
| "loss": 3.5286, |
| "step": 161 |
| }, |
| { |
| "epoch": 6.466019417475728, |
| "grad_norm": 2.5149052143096924, |
| "learning_rate": 3.2200000000000003e-05, |
| "loss": 3.6065, |
| "step": 162 |
| }, |
| { |
| "epoch": 6.504854368932039, |
| "grad_norm": 2.6401753425598145, |
| "learning_rate": 3.24e-05, |
| "loss": 3.4707, |
| "step": 163 |
| }, |
| { |
| "epoch": 6.543689320388349, |
| "grad_norm": 2.725781202316284, |
| "learning_rate": 3.26e-05, |
| "loss": 3.5645, |
| "step": 164 |
| }, |
| { |
| "epoch": 6.58252427184466, |
| "grad_norm": 2.7084786891937256, |
| "learning_rate": 3.2800000000000004e-05, |
| "loss": 3.5482, |
| "step": 165 |
| }, |
| { |
| "epoch": 6.621359223300971, |
| "grad_norm": 2.6076486110687256, |
| "learning_rate": 3.3e-05, |
| "loss": 3.4842, |
| "step": 166 |
| }, |
| { |
| "epoch": 6.660194174757281, |
| "grad_norm": 3.0237390995025635, |
| "learning_rate": 3.32e-05, |
| "loss": 3.5313, |
| "step": 167 |
| }, |
| { |
| "epoch": 6.699029126213592, |
| "grad_norm": 2.807459831237793, |
| "learning_rate": 3.3400000000000005e-05, |
| "loss": 3.5354, |
| "step": 168 |
| }, |
| { |
| "epoch": 6.737864077669903, |
| "grad_norm": 3.13301420211792, |
| "learning_rate": 3.3600000000000004e-05, |
| "loss": 3.4923, |
| "step": 169 |
| }, |
| { |
| "epoch": 6.776699029126213, |
| "grad_norm": 2.5862674713134766, |
| "learning_rate": 3.38e-05, |
| "loss": 3.5315, |
| "step": 170 |
| }, |
| { |
| "epoch": 6.815533980582524, |
| "grad_norm": 3.192603588104248, |
| "learning_rate": 3.4000000000000007e-05, |
| "loss": 3.4937, |
| "step": 171 |
| }, |
| { |
| "epoch": 6.854368932038835, |
| "grad_norm": 2.440667152404785, |
| "learning_rate": 3.4200000000000005e-05, |
| "loss": 3.4632, |
| "step": 172 |
| }, |
| { |
| "epoch": 6.893203883495145, |
| "grad_norm": 3.0425989627838135, |
| "learning_rate": 3.4399999999999996e-05, |
| "loss": 3.4456, |
| "step": 173 |
| }, |
| { |
| "epoch": 6.932038834951456, |
| "grad_norm": 3.369929313659668, |
| "learning_rate": 3.46e-05, |
| "loss": 3.4061, |
| "step": 174 |
| }, |
| { |
| "epoch": 6.970873786407767, |
| "grad_norm": 2.883514165878296, |
| "learning_rate": 3.48e-05, |
| "loss": 3.4312, |
| "step": 175 |
| }, |
| { |
| "epoch": 6.970873786407767, |
| "eval_loss": 3.5276877880096436, |
| "eval_runtime": 0.9695, |
| "eval_samples_per_second": 22.692, |
| "eval_steps_per_second": 6.189, |
| "step": 175 |
| }, |
| { |
| "epoch": 7.038834951456311, |
| "grad_norm": 5.757262706756592, |
| "learning_rate": 3.5e-05, |
| "loss": 6.8588, |
| "step": 176 |
| }, |
| { |
| "epoch": 7.077669902912621, |
| "grad_norm": 2.7623355388641357, |
| "learning_rate": 3.52e-05, |
| "loss": 3.389, |
| "step": 177 |
| }, |
| { |
| "epoch": 7.116504854368932, |
| "grad_norm": 3.601408004760742, |
| "learning_rate": 3.54e-05, |
| "loss": 3.4136, |
| "step": 178 |
| }, |
| { |
| "epoch": 7.155339805825243, |
| "grad_norm": 2.4193849563598633, |
| "learning_rate": 3.56e-05, |
| "loss": 3.3387, |
| "step": 179 |
| }, |
| { |
| "epoch": 7.194174757281553, |
| "grad_norm": 3.1988773345947266, |
| "learning_rate": 3.58e-05, |
| "loss": 3.3565, |
| "step": 180 |
| }, |
| { |
| "epoch": 7.233009708737864, |
| "grad_norm": 3.6124112606048584, |
| "learning_rate": 3.6e-05, |
| "loss": 3.3662, |
| "step": 181 |
| }, |
| { |
| "epoch": 7.271844660194175, |
| "grad_norm": 2.836766242980957, |
| "learning_rate": 3.62e-05, |
| "loss": 3.2874, |
| "step": 182 |
| }, |
| { |
| "epoch": 7.310679611650485, |
| "grad_norm": 3.2610206604003906, |
| "learning_rate": 3.6400000000000004e-05, |
| "loss": 3.2984, |
| "step": 183 |
| }, |
| { |
| "epoch": 7.349514563106796, |
| "grad_norm": 2.8655757904052734, |
| "learning_rate": 3.66e-05, |
| "loss": 3.2584, |
| "step": 184 |
| }, |
| { |
| "epoch": 7.388349514563107, |
| "grad_norm": 3.4718809127807617, |
| "learning_rate": 3.68e-05, |
| "loss": 3.2489, |
| "step": 185 |
| }, |
| { |
| "epoch": 7.427184466019417, |
| "grad_norm": 3.2131571769714355, |
| "learning_rate": 3.7e-05, |
| "loss": 3.3321, |
| "step": 186 |
| }, |
| { |
| "epoch": 7.466019417475728, |
| "grad_norm": 3.1714115142822266, |
| "learning_rate": 3.72e-05, |
| "loss": 3.3489, |
| "step": 187 |
| }, |
| { |
| "epoch": 7.504854368932039, |
| "grad_norm": 2.877065658569336, |
| "learning_rate": 3.74e-05, |
| "loss": 3.245, |
| "step": 188 |
| }, |
| { |
| "epoch": 7.543689320388349, |
| "grad_norm": 3.1105806827545166, |
| "learning_rate": 3.76e-05, |
| "loss": 3.272, |
| "step": 189 |
| }, |
| { |
| "epoch": 7.58252427184466, |
| "grad_norm": 3.5332155227661133, |
| "learning_rate": 3.7800000000000004e-05, |
| "loss": 3.3132, |
| "step": 190 |
| }, |
| { |
| "epoch": 7.621359223300971, |
| "grad_norm": 2.8226609230041504, |
| "learning_rate": 3.8e-05, |
| "loss": 3.2721, |
| "step": 191 |
| }, |
| { |
| "epoch": 7.660194174757281, |
| "grad_norm": 2.5367422103881836, |
| "learning_rate": 3.82e-05, |
| "loss": 3.3234, |
| "step": 192 |
| }, |
| { |
| "epoch": 7.699029126213592, |
| "grad_norm": 2.9826626777648926, |
| "learning_rate": 3.8400000000000005e-05, |
| "loss": 3.2643, |
| "step": 193 |
| }, |
| { |
| "epoch": 7.737864077669903, |
| "grad_norm": 3.456496477127075, |
| "learning_rate": 3.86e-05, |
| "loss": 3.2105, |
| "step": 194 |
| }, |
| { |
| "epoch": 7.776699029126213, |
| "grad_norm": 3.286680221557617, |
| "learning_rate": 3.88e-05, |
| "loss": 3.2156, |
| "step": 195 |
| }, |
| { |
| "epoch": 7.815533980582524, |
| "grad_norm": 2.996983528137207, |
| "learning_rate": 3.9000000000000006e-05, |
| "loss": 3.3637, |
| "step": 196 |
| }, |
| { |
| "epoch": 7.854368932038835, |
| "grad_norm": 3.129873037338257, |
| "learning_rate": 3.9200000000000004e-05, |
| "loss": 3.2444, |
| "step": 197 |
| }, |
| { |
| "epoch": 7.893203883495145, |
| "grad_norm": 2.591716766357422, |
| "learning_rate": 3.94e-05, |
| "loss": 3.2831, |
| "step": 198 |
| }, |
| { |
| "epoch": 7.932038834951456, |
| "grad_norm": 2.664017677307129, |
| "learning_rate": 3.960000000000001e-05, |
| "loss": 3.1692, |
| "step": 199 |
| }, |
| { |
| "epoch": 7.970873786407767, |
| "grad_norm": 2.8941309452056885, |
| "learning_rate": 3.9800000000000005e-05, |
| "loss": 3.2986, |
| "step": 200 |
| }, |
| { |
| "epoch": 7.970873786407767, |
| "eval_loss": 3.3523428440093994, |
| "eval_runtime": 0.9896, |
| "eval_samples_per_second": 22.23, |
| "eval_steps_per_second": 6.063, |
| "step": 200 |
| }, |
| { |
| "epoch": 8.03883495145631, |
| "grad_norm": 6.677456378936768, |
| "learning_rate": 4e-05, |
| "loss": 6.5052, |
| "step": 201 |
| }, |
| { |
| "epoch": 8.077669902912621, |
| "grad_norm": 3.38222599029541, |
| "learning_rate": 4.02e-05, |
| "loss": 3.1682, |
| "step": 202 |
| }, |
| { |
| "epoch": 8.116504854368932, |
| "grad_norm": 2.9144835472106934, |
| "learning_rate": 4.0400000000000006e-05, |
| "loss": 3.1505, |
| "step": 203 |
| }, |
| { |
| "epoch": 8.155339805825243, |
| "grad_norm": 2.837830066680908, |
| "learning_rate": 4.0600000000000004e-05, |
| "loss": 3.1576, |
| "step": 204 |
| }, |
| { |
| "epoch": 8.194174757281553, |
| "grad_norm": 3.3070290088653564, |
| "learning_rate": 4.08e-05, |
| "loss": 3.1545, |
| "step": 205 |
| }, |
| { |
| "epoch": 8.233009708737864, |
| "grad_norm": 2.6031386852264404, |
| "learning_rate": 4.1e-05, |
| "loss": 3.1598, |
| "step": 206 |
| }, |
| { |
| "epoch": 8.271844660194175, |
| "grad_norm": 2.879425525665283, |
| "learning_rate": 4.12e-05, |
| "loss": 3.1222, |
| "step": 207 |
| }, |
| { |
| "epoch": 8.310679611650485, |
| "grad_norm": 3.14932918548584, |
| "learning_rate": 4.14e-05, |
| "loss": 3.0743, |
| "step": 208 |
| }, |
| { |
| "epoch": 8.349514563106796, |
| "grad_norm": 3.3993191719055176, |
| "learning_rate": 4.16e-05, |
| "loss": 3.1589, |
| "step": 209 |
| }, |
| { |
| "epoch": 8.388349514563107, |
| "grad_norm": 3.2141942977905273, |
| "learning_rate": 4.18e-05, |
| "loss": 3.0428, |
| "step": 210 |
| }, |
| { |
| "epoch": 8.427184466019417, |
| "grad_norm": 2.791717290878296, |
| "learning_rate": 4.2e-05, |
| "loss": 3.1158, |
| "step": 211 |
| }, |
| { |
| "epoch": 8.466019417475728, |
| "grad_norm": 3.1668970584869385, |
| "learning_rate": 4.22e-05, |
| "loss": 3.0463, |
| "step": 212 |
| }, |
| { |
| "epoch": 8.504854368932039, |
| "grad_norm": 2.4356696605682373, |
| "learning_rate": 4.24e-05, |
| "loss": 3.1434, |
| "step": 213 |
| }, |
| { |
| "epoch": 8.54368932038835, |
| "grad_norm": 2.9241132736206055, |
| "learning_rate": 4.26e-05, |
| "loss": 3.0292, |
| "step": 214 |
| }, |
| { |
| "epoch": 8.58252427184466, |
| "grad_norm": 2.4170773029327393, |
| "learning_rate": 4.2800000000000004e-05, |
| "loss": 3.0923, |
| "step": 215 |
| }, |
| { |
| "epoch": 8.62135922330097, |
| "grad_norm": 2.4428963661193848, |
| "learning_rate": 4.3e-05, |
| "loss": 3.0588, |
| "step": 216 |
| }, |
| { |
| "epoch": 8.660194174757281, |
| "grad_norm": 3.0066943168640137, |
| "learning_rate": 4.32e-05, |
| "loss": 3.0815, |
| "step": 217 |
| }, |
| { |
| "epoch": 8.699029126213592, |
| "grad_norm": 3.0532405376434326, |
| "learning_rate": 4.3400000000000005e-05, |
| "loss": 3.1377, |
| "step": 218 |
| }, |
| { |
| "epoch": 8.737864077669903, |
| "grad_norm": 2.9405910968780518, |
| "learning_rate": 4.36e-05, |
| "loss": 3.081, |
| "step": 219 |
| }, |
| { |
| "epoch": 8.776699029126213, |
| "grad_norm": 2.82438325881958, |
| "learning_rate": 4.38e-05, |
| "loss": 3.0538, |
| "step": 220 |
| }, |
| { |
| "epoch": 8.815533980582524, |
| "grad_norm": 2.899946928024292, |
| "learning_rate": 4.4000000000000006e-05, |
| "loss": 3.0664, |
| "step": 221 |
| }, |
| { |
| "epoch": 8.854368932038835, |
| "grad_norm": 2.4132299423217773, |
| "learning_rate": 4.4200000000000004e-05, |
| "loss": 3.0723, |
| "step": 222 |
| }, |
| { |
| "epoch": 8.893203883495145, |
| "grad_norm": 3.2833642959594727, |
| "learning_rate": 4.44e-05, |
| "loss": 3.0445, |
| "step": 223 |
| }, |
| { |
| "epoch": 8.932038834951456, |
| "grad_norm": 2.60457706451416, |
| "learning_rate": 4.46e-05, |
| "loss": 3.0134, |
| "step": 224 |
| }, |
| { |
| "epoch": 8.970873786407767, |
| "grad_norm": 2.7552649974823, |
| "learning_rate": 4.4800000000000005e-05, |
| "loss": 3.0772, |
| "step": 225 |
| }, |
| { |
| "epoch": 8.970873786407767, |
| "eval_loss": 3.2081830501556396, |
| "eval_runtime": 0.971, |
| "eval_samples_per_second": 22.657, |
| "eval_steps_per_second": 6.179, |
| "step": 225 |
| }, |
| { |
| "epoch": 9.03883495145631, |
| "grad_norm": 6.834669589996338, |
| "learning_rate": 4.5e-05, |
| "loss": 6.0, |
| "step": 226 |
| }, |
| { |
| "epoch": 9.077669902912621, |
| "grad_norm": 2.7431795597076416, |
| "learning_rate": 4.52e-05, |
| "loss": 2.8938, |
| "step": 227 |
| }, |
| { |
| "epoch": 9.116504854368932, |
| "grad_norm": 3.5260982513427734, |
| "learning_rate": 4.5400000000000006e-05, |
| "loss": 2.955, |
| "step": 228 |
| }, |
| { |
| "epoch": 9.155339805825243, |
| "grad_norm": 2.705111026763916, |
| "learning_rate": 4.5600000000000004e-05, |
| "loss": 2.9586, |
| "step": 229 |
| }, |
| { |
| "epoch": 9.194174757281553, |
| "grad_norm": 3.2462103366851807, |
| "learning_rate": 4.58e-05, |
| "loss": 2.8973, |
| "step": 230 |
| }, |
| { |
| "epoch": 9.233009708737864, |
| "grad_norm": 2.788363218307495, |
| "learning_rate": 4.600000000000001e-05, |
| "loss": 2.9328, |
| "step": 231 |
| }, |
| { |
| "epoch": 9.271844660194175, |
| "grad_norm": 2.4145243167877197, |
| "learning_rate": 4.6200000000000005e-05, |
| "loss": 2.9653, |
| "step": 232 |
| }, |
| { |
| "epoch": 9.310679611650485, |
| "grad_norm": 3.2271153926849365, |
| "learning_rate": 4.64e-05, |
| "loss": 2.922, |
| "step": 233 |
| }, |
| { |
| "epoch": 9.349514563106796, |
| "grad_norm": 2.3625218868255615, |
| "learning_rate": 4.660000000000001e-05, |
| "loss": 3.0413, |
| "step": 234 |
| }, |
| { |
| "epoch": 9.388349514563107, |
| "grad_norm": 3.17262864112854, |
| "learning_rate": 4.6800000000000006e-05, |
| "loss": 2.9962, |
| "step": 235 |
| }, |
| { |
| "epoch": 9.427184466019417, |
| "grad_norm": 2.906003475189209, |
| "learning_rate": 4.7e-05, |
| "loss": 2.9422, |
| "step": 236 |
| }, |
| { |
| "epoch": 9.466019417475728, |
| "grad_norm": 2.1498398780822754, |
| "learning_rate": 4.72e-05, |
| "loss": 2.9061, |
| "step": 237 |
| }, |
| { |
| "epoch": 9.504854368932039, |
| "grad_norm": 2.9519286155700684, |
| "learning_rate": 4.74e-05, |
| "loss": 2.967, |
| "step": 238 |
| }, |
| { |
| "epoch": 9.54368932038835, |
| "grad_norm": 2.561063528060913, |
| "learning_rate": 4.76e-05, |
| "loss": 2.9191, |
| "step": 239 |
| }, |
| { |
| "epoch": 9.58252427184466, |
| "grad_norm": 3.8291261196136475, |
| "learning_rate": 4.78e-05, |
| "loss": 2.9071, |
| "step": 240 |
| }, |
| { |
| "epoch": 9.62135922330097, |
| "grad_norm": 3.4280309677124023, |
| "learning_rate": 4.8e-05, |
| "loss": 2.9384, |
| "step": 241 |
| }, |
| { |
| "epoch": 9.660194174757281, |
| "grad_norm": 3.460054397583008, |
| "learning_rate": 4.82e-05, |
| "loss": 2.9387, |
| "step": 242 |
| }, |
| { |
| "epoch": 9.699029126213592, |
| "grad_norm": 3.3750805854797363, |
| "learning_rate": 4.8400000000000004e-05, |
| "loss": 2.9552, |
| "step": 243 |
| }, |
| { |
| "epoch": 9.737864077669903, |
| "grad_norm": 2.6689562797546387, |
| "learning_rate": 4.86e-05, |
| "loss": 2.8809, |
| "step": 244 |
| }, |
| { |
| "epoch": 9.776699029126213, |
| "grad_norm": 2.9314560890197754, |
| "learning_rate": 4.88e-05, |
| "loss": 2.7902, |
| "step": 245 |
| }, |
| { |
| "epoch": 9.815533980582524, |
| "grad_norm": 2.630530595779419, |
| "learning_rate": 4.9e-05, |
| "loss": 2.8857, |
| "step": 246 |
| }, |
| { |
| "epoch": 9.854368932038835, |
| "grad_norm": 2.546659231185913, |
| "learning_rate": 4.92e-05, |
| "loss": 2.8896, |
| "step": 247 |
| }, |
| { |
| "epoch": 9.893203883495145, |
| "grad_norm": 2.795778751373291, |
| "learning_rate": 4.94e-05, |
| "loss": 2.9516, |
| "step": 248 |
| }, |
| { |
| "epoch": 9.932038834951456, |
| "grad_norm": 3.0504794120788574, |
| "learning_rate": 4.96e-05, |
| "loss": 2.9132, |
| "step": 249 |
| }, |
| { |
| "epoch": 9.970873786407767, |
| "grad_norm": 3.444287061691284, |
| "learning_rate": 4.9800000000000004e-05, |
| "loss": 2.973, |
| "step": 250 |
| }, |
| { |
| "epoch": 9.970873786407767, |
| "eval_loss": 3.089428186416626, |
| "eval_runtime": 0.9754, |
| "eval_samples_per_second": 22.555, |
| "eval_steps_per_second": 6.151, |
| "step": 250 |
| }, |
| { |
| "epoch": 10.03883495145631, |
| "grad_norm": 5.480017185211182, |
| "learning_rate": 5e-05, |
| "loss": 5.6196, |
| "step": 251 |
| }, |
| { |
| "epoch": 10.077669902912621, |
| "grad_norm": 3.3957669734954834, |
| "learning_rate": 4.999997563061038e-05, |
| "loss": 2.8152, |
| "step": 252 |
| }, |
| { |
| "epoch": 10.116504854368932, |
| "grad_norm": 2.6747496128082275, |
| "learning_rate": 4.9999902522489015e-05, |
| "loss": 2.8624, |
| "step": 253 |
| }, |
| { |
| "epoch": 10.155339805825243, |
| "grad_norm": 3.2186131477355957, |
| "learning_rate": 4.999978067577844e-05, |
| "loss": 2.7587, |
| "step": 254 |
| }, |
| { |
| "epoch": 10.194174757281553, |
| "grad_norm": 3.7385358810424805, |
| "learning_rate": 4.999961009071621e-05, |
| "loss": 2.8117, |
| "step": 255 |
| }, |
| { |
| "epoch": 10.233009708737864, |
| "grad_norm": 2.586005926132202, |
| "learning_rate": 4.999939076763487e-05, |
| "loss": 2.7617, |
| "step": 256 |
| }, |
| { |
| "epoch": 10.271844660194175, |
| "grad_norm": 2.7468533515930176, |
| "learning_rate": 4.999912270696202e-05, |
| "loss": 2.802, |
| "step": 257 |
| }, |
| { |
| "epoch": 10.310679611650485, |
| "grad_norm": 2.7268691062927246, |
| "learning_rate": 4.999880590922025e-05, |
| "loss": 2.7928, |
| "step": 258 |
| }, |
| { |
| "epoch": 10.349514563106796, |
| "grad_norm": 2.6305949687957764, |
| "learning_rate": 4.9998440375027166e-05, |
| "loss": 2.8245, |
| "step": 259 |
| }, |
| { |
| "epoch": 10.388349514563107, |
| "grad_norm": 2.8977084159851074, |
| "learning_rate": 4.9998026105095405e-05, |
| "loss": 2.7525, |
| "step": 260 |
| }, |
| { |
| "epoch": 10.427184466019417, |
| "grad_norm": 2.394578218460083, |
| "learning_rate": 4.999756310023261e-05, |
| "loss": 2.731, |
| "step": 261 |
| }, |
| { |
| "epoch": 10.466019417475728, |
| "grad_norm": 3.0859174728393555, |
| "learning_rate": 4.9997051361341425e-05, |
| "loss": 2.7902, |
| "step": 262 |
| }, |
| { |
| "epoch": 10.504854368932039, |
| "grad_norm": 2.929978370666504, |
| "learning_rate": 4.9996490889419514e-05, |
| "loss": 2.7723, |
| "step": 263 |
| }, |
| { |
| "epoch": 10.54368932038835, |
| "grad_norm": 2.6215100288391113, |
| "learning_rate": 4.999588168555954e-05, |
| "loss": 2.7892, |
| "step": 264 |
| }, |
| { |
| "epoch": 10.58252427184466, |
| "grad_norm": 2.744954824447632, |
| "learning_rate": 4.999522375094919e-05, |
| "loss": 2.8024, |
| "step": 265 |
| }, |
| { |
| "epoch": 10.62135922330097, |
| "grad_norm": 2.775912046432495, |
| "learning_rate": 4.999451708687114e-05, |
| "loss": 2.642, |
| "step": 266 |
| }, |
| { |
| "epoch": 10.660194174757281, |
| "grad_norm": 2.5821340084075928, |
| "learning_rate": 4.999376169470306e-05, |
| "loss": 2.7808, |
| "step": 267 |
| }, |
| { |
| "epoch": 10.699029126213592, |
| "grad_norm": 2.4101083278656006, |
| "learning_rate": 4.999295757591762e-05, |
| "loss": 2.7318, |
| "step": 268 |
| }, |
| { |
| "epoch": 10.737864077669903, |
| "grad_norm": 2.4816181659698486, |
| "learning_rate": 4.99921047320825e-05, |
| "loss": 2.7707, |
| "step": 269 |
| }, |
| { |
| "epoch": 10.776699029126213, |
| "grad_norm": 2.366009473800659, |
| "learning_rate": 4.9991203164860365e-05, |
| "loss": 2.7481, |
| "step": 270 |
| }, |
| { |
| "epoch": 10.815533980582524, |
| "grad_norm": 2.9792630672454834, |
| "learning_rate": 4.999025287600886e-05, |
| "loss": 2.7204, |
| "step": 271 |
| }, |
| { |
| "epoch": 10.854368932038835, |
| "grad_norm": 3.0781967639923096, |
| "learning_rate": 4.998925386738063e-05, |
| "loss": 2.7248, |
| "step": 272 |
| }, |
| { |
| "epoch": 10.893203883495145, |
| "grad_norm": 2.6866307258605957, |
| "learning_rate": 4.998820614092328e-05, |
| "loss": 2.7456, |
| "step": 273 |
| }, |
| { |
| "epoch": 10.932038834951456, |
| "grad_norm": 2.789808988571167, |
| "learning_rate": 4.998710969867942e-05, |
| "loss": 2.7224, |
| "step": 274 |
| }, |
| { |
| "epoch": 10.970873786407767, |
| "grad_norm": 2.4948067665100098, |
| "learning_rate": 4.9985964542786614e-05, |
| "loss": 2.6724, |
| "step": 275 |
| }, |
| { |
| "epoch": 10.970873786407767, |
| "eval_loss": 2.9974570274353027, |
| "eval_runtime": 0.9771, |
| "eval_samples_per_second": 22.516, |
| "eval_steps_per_second": 6.141, |
| "step": 275 |
| }, |
| { |
| "epoch": 11.03883495145631, |
| "grad_norm": 4.32741117477417, |
| "learning_rate": 4.99847706754774e-05, |
| "loss": 5.4423, |
| "step": 276 |
| }, |
| { |
| "epoch": 11.077669902912621, |
| "grad_norm": 2.3572208881378174, |
| "learning_rate": 4.998352809907928e-05, |
| "loss": 2.6372, |
| "step": 277 |
| }, |
| { |
| "epoch": 11.116504854368932, |
| "grad_norm": 2.4045934677124023, |
| "learning_rate": 4.998223681601473e-05, |
| "loss": 2.6205, |
| "step": 278 |
| }, |
| { |
| "epoch": 11.155339805825243, |
| "grad_norm": 2.5755131244659424, |
| "learning_rate": 4.998089682880117e-05, |
| "loss": 2.5939, |
| "step": 279 |
| }, |
| { |
| "epoch": 11.194174757281553, |
| "grad_norm": 2.5768463611602783, |
| "learning_rate": 4.997950814005098e-05, |
| "loss": 2.6925, |
| "step": 280 |
| }, |
| { |
| "epoch": 11.233009708737864, |
| "grad_norm": 2.5549166202545166, |
| "learning_rate": 4.997807075247146e-05, |
| "loss": 2.6172, |
| "step": 281 |
| }, |
| { |
| "epoch": 11.271844660194175, |
| "grad_norm": 2.761068344116211, |
| "learning_rate": 4.997658466886489e-05, |
| "loss": 2.6572, |
| "step": 282 |
| }, |
| { |
| "epoch": 11.310679611650485, |
| "grad_norm": 2.5051231384277344, |
| "learning_rate": 4.9975049892128455e-05, |
| "loss": 2.6549, |
| "step": 283 |
| }, |
| { |
| "epoch": 11.349514563106796, |
| "grad_norm": 2.7434117794036865, |
| "learning_rate": 4.9973466425254286e-05, |
| "loss": 2.5632, |
| "step": 284 |
| }, |
| { |
| "epoch": 11.388349514563107, |
| "grad_norm": 2.328563928604126, |
| "learning_rate": 4.997183427132943e-05, |
| "loss": 2.5751, |
| "step": 285 |
| }, |
| { |
| "epoch": 11.427184466019417, |
| "grad_norm": 2.7668466567993164, |
| "learning_rate": 4.997015343353585e-05, |
| "loss": 2.6609, |
| "step": 286 |
| }, |
| { |
| "epoch": 11.466019417475728, |
| "grad_norm": 2.0831525325775146, |
| "learning_rate": 4.996842391515044e-05, |
| "loss": 2.6428, |
| "step": 287 |
| }, |
| { |
| "epoch": 11.504854368932039, |
| "grad_norm": 2.4443278312683105, |
| "learning_rate": 4.996664571954497e-05, |
| "loss": 2.6012, |
| "step": 288 |
| }, |
| { |
| "epoch": 11.54368932038835, |
| "grad_norm": 2.4806153774261475, |
| "learning_rate": 4.9964818850186135e-05, |
| "loss": 2.6649, |
| "step": 289 |
| }, |
| { |
| "epoch": 11.58252427184466, |
| "grad_norm": 2.539933919906616, |
| "learning_rate": 4.99629433106355e-05, |
| "loss": 2.6253, |
| "step": 290 |
| }, |
| { |
| "epoch": 11.62135922330097, |
| "grad_norm": 2.7404544353485107, |
| "learning_rate": 4.996101910454953e-05, |
| "loss": 2.6224, |
| "step": 291 |
| }, |
| { |
| "epoch": 11.660194174757281, |
| "grad_norm": 2.5377357006073, |
| "learning_rate": 4.9959046235679565e-05, |
| "loss": 2.6249, |
| "step": 292 |
| }, |
| { |
| "epoch": 11.699029126213592, |
| "grad_norm": 2.8488271236419678, |
| "learning_rate": 4.9957024707871806e-05, |
| "loss": 2.6232, |
| "step": 293 |
| }, |
| { |
| "epoch": 11.737864077669903, |
| "grad_norm": 2.4895827770233154, |
| "learning_rate": 4.9954954525067334e-05, |
| "loss": 2.5983, |
| "step": 294 |
| }, |
| { |
| "epoch": 11.776699029126213, |
| "grad_norm": 3.038975954055786, |
| "learning_rate": 4.995283569130207e-05, |
| "loss": 2.5715, |
| "step": 295 |
| }, |
| { |
| "epoch": 11.815533980582524, |
| "grad_norm": 2.674245595932007, |
| "learning_rate": 4.995066821070679e-05, |
| "loss": 2.6201, |
| "step": 296 |
| }, |
| { |
| "epoch": 11.854368932038835, |
| "grad_norm": 3.5277645587921143, |
| "learning_rate": 4.9948452087507116e-05, |
| "loss": 2.6376, |
| "step": 297 |
| }, |
| { |
| "epoch": 11.893203883495145, |
| "grad_norm": 3.0974984169006348, |
| "learning_rate": 4.994618732602349e-05, |
| "loss": 2.6268, |
| "step": 298 |
| }, |
| { |
| "epoch": 11.932038834951456, |
| "grad_norm": 2.309119462966919, |
| "learning_rate": 4.994387393067117e-05, |
| "loss": 2.5594, |
| "step": 299 |
| }, |
| { |
| "epoch": 11.970873786407767, |
| "grad_norm": 2.540464162826538, |
| "learning_rate": 4.994151190596025e-05, |
| "loss": 2.5765, |
| "step": 300 |
| }, |
| { |
| "epoch": 11.970873786407767, |
| "eval_loss": 2.9208481311798096, |
| "eval_runtime": 1.0115, |
| "eval_samples_per_second": 21.749, |
| "eval_steps_per_second": 5.932, |
| "step": 300 |
| }, |
| { |
| "epoch": 12.03883495145631, |
| "grad_norm": 5.542501449584961, |
| "learning_rate": 4.993910125649561e-05, |
| "loss": 5.1943, |
| "step": 301 |
| }, |
| { |
| "epoch": 12.077669902912621, |
| "grad_norm": 2.2998414039611816, |
| "learning_rate": 4.993664198697694e-05, |
| "loss": 2.5311, |
| "step": 302 |
| }, |
| { |
| "epoch": 12.116504854368932, |
| "grad_norm": 3.0827107429504395, |
| "learning_rate": 4.993413410219871e-05, |
| "loss": 2.5587, |
| "step": 303 |
| }, |
| { |
| "epoch": 12.155339805825243, |
| "grad_norm": 2.7742204666137695, |
| "learning_rate": 4.9931577607050175e-05, |
| "loss": 2.4549, |
| "step": 304 |
| }, |
| { |
| "epoch": 12.194174757281553, |
| "grad_norm": 2.5605695247650146, |
| "learning_rate": 4.992897250651535e-05, |
| "loss": 2.5602, |
| "step": 305 |
| }, |
| { |
| "epoch": 12.233009708737864, |
| "grad_norm": 2.8852667808532715, |
| "learning_rate": 4.992631880567301e-05, |
| "loss": 2.5069, |
| "step": 306 |
| }, |
| { |
| "epoch": 12.271844660194175, |
| "grad_norm": 3.006777048110962, |
| "learning_rate": 4.9923616509696683e-05, |
| "loss": 2.5326, |
| "step": 307 |
| }, |
| { |
| "epoch": 12.310679611650485, |
| "grad_norm": 2.1645665168762207, |
| "learning_rate": 4.9920865623854615e-05, |
| "loss": 2.4739, |
| "step": 308 |
| }, |
| { |
| "epoch": 12.349514563106796, |
| "grad_norm": 2.941042423248291, |
| "learning_rate": 4.9918066153509834e-05, |
| "loss": 2.5149, |
| "step": 309 |
| }, |
| { |
| "epoch": 12.388349514563107, |
| "grad_norm": 2.598097562789917, |
| "learning_rate": 4.991521810412002e-05, |
| "loss": 2.5214, |
| "step": 310 |
| }, |
| { |
| "epoch": 12.427184466019417, |
| "grad_norm": 2.408721446990967, |
| "learning_rate": 4.991232148123761e-05, |
| "loss": 2.4747, |
| "step": 311 |
| }, |
| { |
| "epoch": 12.466019417475728, |
| "grad_norm": 2.39508318901062, |
| "learning_rate": 4.990937629050971e-05, |
| "loss": 2.5304, |
| "step": 312 |
| }, |
| { |
| "epoch": 12.504854368932039, |
| "grad_norm": 2.9436190128326416, |
| "learning_rate": 4.990638253767812e-05, |
| "loss": 2.5046, |
| "step": 313 |
| }, |
| { |
| "epoch": 12.54368932038835, |
| "grad_norm": 2.6037611961364746, |
| "learning_rate": 4.990334022857932e-05, |
| "loss": 2.4537, |
| "step": 314 |
| }, |
| { |
| "epoch": 12.58252427184466, |
| "grad_norm": 2.892789602279663, |
| "learning_rate": 4.9900249369144434e-05, |
| "loss": 2.4817, |
| "step": 315 |
| }, |
| { |
| "epoch": 12.62135922330097, |
| "grad_norm": 2.6804611682891846, |
| "learning_rate": 4.989710996539926e-05, |
| "loss": 2.5012, |
| "step": 316 |
| }, |
| { |
| "epoch": 12.660194174757281, |
| "grad_norm": 2.458824396133423, |
| "learning_rate": 4.9893922023464236e-05, |
| "loss": 2.4661, |
| "step": 317 |
| }, |
| { |
| "epoch": 12.699029126213592, |
| "grad_norm": 2.6641952991485596, |
| "learning_rate": 4.989068554955439e-05, |
| "loss": 2.4971, |
| "step": 318 |
| }, |
| { |
| "epoch": 12.737864077669903, |
| "grad_norm": 2.421142101287842, |
| "learning_rate": 4.988740054997943e-05, |
| "loss": 2.4014, |
| "step": 319 |
| }, |
| { |
| "epoch": 12.776699029126213, |
| "grad_norm": 2.4107542037963867, |
| "learning_rate": 4.98840670311436e-05, |
| "loss": 2.4636, |
| "step": 320 |
| }, |
| { |
| "epoch": 12.815533980582524, |
| "grad_norm": 2.5701303482055664, |
| "learning_rate": 4.988068499954578e-05, |
| "loss": 2.4564, |
| "step": 321 |
| }, |
| { |
| "epoch": 12.854368932038835, |
| "grad_norm": 2.3998067378997803, |
| "learning_rate": 4.987725446177941e-05, |
| "loss": 2.4561, |
| "step": 322 |
| }, |
| { |
| "epoch": 12.893203883495145, |
| "grad_norm": 2.6888773441314697, |
| "learning_rate": 4.987377542453251e-05, |
| "loss": 2.4392, |
| "step": 323 |
| }, |
| { |
| "epoch": 12.932038834951456, |
| "grad_norm": 2.313508987426758, |
| "learning_rate": 4.987024789458762e-05, |
| "loss": 2.4438, |
| "step": 324 |
| }, |
| { |
| "epoch": 12.970873786407767, |
| "grad_norm": 2.5614566802978516, |
| "learning_rate": 4.986667187882186e-05, |
| "loss": 2.557, |
| "step": 325 |
| }, |
| { |
| "epoch": 12.970873786407767, |
| "eval_loss": 2.8833444118499756, |
| "eval_runtime": 1.0479, |
| "eval_samples_per_second": 20.995, |
| "eval_steps_per_second": 5.726, |
| "step": 325 |
| }, |
| { |
| "epoch": 13.03883495145631, |
| "grad_norm": 5.110360145568848, |
| "learning_rate": 4.9863047384206835e-05, |
| "loss": 4.9144, |
| "step": 326 |
| }, |
| { |
| "epoch": 13.077669902912621, |
| "grad_norm": 2.7373085021972656, |
| "learning_rate": 4.98593744178087e-05, |
| "loss": 2.3994, |
| "step": 327 |
| }, |
| { |
| "epoch": 13.116504854368932, |
| "grad_norm": 2.542954206466675, |
| "learning_rate": 4.985565298678809e-05, |
| "loss": 2.3535, |
| "step": 328 |
| }, |
| { |
| "epoch": 13.155339805825243, |
| "grad_norm": 2.6374223232269287, |
| "learning_rate": 4.985188309840012e-05, |
| "loss": 2.3894, |
| "step": 329 |
| }, |
| { |
| "epoch": 13.194174757281553, |
| "grad_norm": 2.541004180908203, |
| "learning_rate": 4.984806475999437e-05, |
| "loss": 2.391, |
| "step": 330 |
| }, |
| { |
| "epoch": 13.233009708737864, |
| "grad_norm": 2.6150271892547607, |
| "learning_rate": 4.984419797901491e-05, |
| "loss": 2.3927, |
| "step": 331 |
| }, |
| { |
| "epoch": 13.271844660194175, |
| "grad_norm": 2.47719144821167, |
| "learning_rate": 4.984028276300021e-05, |
| "loss": 2.3751, |
| "step": 332 |
| }, |
| { |
| "epoch": 13.310679611650485, |
| "grad_norm": 2.679882764816284, |
| "learning_rate": 4.983631911958319e-05, |
| "loss": 2.374, |
| "step": 333 |
| }, |
| { |
| "epoch": 13.349514563106796, |
| "grad_norm": 2.784619092941284, |
| "learning_rate": 4.983230705649118e-05, |
| "loss": 2.3539, |
| "step": 334 |
| }, |
| { |
| "epoch": 13.388349514563107, |
| "grad_norm": 2.188197135925293, |
| "learning_rate": 4.982824658154589e-05, |
| "loss": 2.3553, |
| "step": 335 |
| }, |
| { |
| "epoch": 13.427184466019417, |
| "grad_norm": 2.232978582382202, |
| "learning_rate": 4.982413770266342e-05, |
| "loss": 2.3389, |
| "step": 336 |
| }, |
| { |
| "epoch": 13.466019417475728, |
| "grad_norm": 2.563889980316162, |
| "learning_rate": 4.981998042785427e-05, |
| "loss": 2.3623, |
| "step": 337 |
| }, |
| { |
| "epoch": 13.504854368932039, |
| "grad_norm": 2.9053828716278076, |
| "learning_rate": 4.9815774765223226e-05, |
| "loss": 2.3705, |
| "step": 338 |
| }, |
| { |
| "epoch": 13.54368932038835, |
| "grad_norm": 2.5447866916656494, |
| "learning_rate": 4.9811520722969465e-05, |
| "loss": 2.3216, |
| "step": 339 |
| }, |
| { |
| "epoch": 13.58252427184466, |
| "grad_norm": 3.22255277633667, |
| "learning_rate": 4.9807218309386444e-05, |
| "loss": 2.3418, |
| "step": 340 |
| }, |
| { |
| "epoch": 13.62135922330097, |
| "grad_norm": 3.154477119445801, |
| "learning_rate": 4.980286753286195e-05, |
| "loss": 2.3843, |
| "step": 341 |
| }, |
| { |
| "epoch": 13.660194174757281, |
| "grad_norm": 3.3448827266693115, |
| "learning_rate": 4.979846840187804e-05, |
| "loss": 2.419, |
| "step": 342 |
| }, |
| { |
| "epoch": 13.699029126213592, |
| "grad_norm": 3.275527238845825, |
| "learning_rate": 4.9794020925011044e-05, |
| "loss": 2.3756, |
| "step": 343 |
| }, |
| { |
| "epoch": 13.737864077669903, |
| "grad_norm": 2.3320887088775635, |
| "learning_rate": 4.9789525110931545e-05, |
| "loss": 2.3201, |
| "step": 344 |
| }, |
| { |
| "epoch": 13.776699029126213, |
| "grad_norm": 2.804107427597046, |
| "learning_rate": 4.978498096840436e-05, |
| "loss": 2.3461, |
| "step": 345 |
| }, |
| { |
| "epoch": 13.815533980582524, |
| "grad_norm": 2.809633255004883, |
| "learning_rate": 4.978038850628854e-05, |
| "loss": 2.3418, |
| "step": 346 |
| }, |
| { |
| "epoch": 13.854368932038835, |
| "grad_norm": 2.9983737468719482, |
| "learning_rate": 4.977574773353732e-05, |
| "loss": 2.4238, |
| "step": 347 |
| }, |
| { |
| "epoch": 13.893203883495145, |
| "grad_norm": 2.892005443572998, |
| "learning_rate": 4.977105865919812e-05, |
| "loss": 2.4266, |
| "step": 348 |
| }, |
| { |
| "epoch": 13.932038834951456, |
| "grad_norm": 2.766019821166992, |
| "learning_rate": 4.976632129241252e-05, |
| "loss": 2.3937, |
| "step": 349 |
| }, |
| { |
| "epoch": 13.970873786407767, |
| "grad_norm": 2.5251376628875732, |
| "learning_rate": 4.976153564241628e-05, |
| "loss": 2.3557, |
| "step": 350 |
| }, |
| { |
| "epoch": 13.970873786407767, |
| "eval_loss": 2.855170965194702, |
| "eval_runtime": 1.0307, |
| "eval_samples_per_second": 21.345, |
| "eval_steps_per_second": 5.821, |
| "step": 350 |
| }, |
| { |
| "epoch": 14.03883495145631, |
| "grad_norm": 5.263445854187012, |
| "learning_rate": 4.975670171853926e-05, |
| "loss": 4.6103, |
| "step": 351 |
| }, |
| { |
| "epoch": 14.077669902912621, |
| "grad_norm": 2.6694159507751465, |
| "learning_rate": 4.975181953020544e-05, |
| "loss": 2.2714, |
| "step": 352 |
| }, |
| { |
| "epoch": 14.116504854368932, |
| "grad_norm": 3.4369680881500244, |
| "learning_rate": 4.9746889086932895e-05, |
| "loss": 2.2303, |
| "step": 353 |
| }, |
| { |
| "epoch": 14.155339805825243, |
| "grad_norm": 3.053704023361206, |
| "learning_rate": 4.974191039833378e-05, |
| "loss": 2.2659, |
| "step": 354 |
| }, |
| { |
| "epoch": 14.194174757281553, |
| "grad_norm": 2.9966983795166016, |
| "learning_rate": 4.973688347411431e-05, |
| "loss": 2.3092, |
| "step": 355 |
| }, |
| { |
| "epoch": 14.233009708737864, |
| "grad_norm": 2.965481758117676, |
| "learning_rate": 4.9731808324074717e-05, |
| "loss": 2.2537, |
| "step": 356 |
| }, |
| { |
| "epoch": 14.271844660194175, |
| "grad_norm": 2.9761455059051514, |
| "learning_rate": 4.9726684958109266e-05, |
| "loss": 2.2865, |
| "step": 357 |
| }, |
| { |
| "epoch": 14.310679611650485, |
| "grad_norm": 2.936624050140381, |
| "learning_rate": 4.972151338620623e-05, |
| "loss": 2.2589, |
| "step": 358 |
| }, |
| { |
| "epoch": 14.349514563106796, |
| "grad_norm": 3.4442408084869385, |
| "learning_rate": 4.971629361844785e-05, |
| "loss": 2.2636, |
| "step": 359 |
| }, |
| { |
| "epoch": 14.388349514563107, |
| "grad_norm": 3.0097110271453857, |
| "learning_rate": 4.971102566501034e-05, |
| "loss": 2.204, |
| "step": 360 |
| }, |
| { |
| "epoch": 14.427184466019417, |
| "grad_norm": 3.7276322841644287, |
| "learning_rate": 4.9705709536163824e-05, |
| "loss": 2.2811, |
| "step": 361 |
| }, |
| { |
| "epoch": 14.466019417475728, |
| "grad_norm": 2.8004868030548096, |
| "learning_rate": 4.970034524227238e-05, |
| "loss": 2.1964, |
| "step": 362 |
| }, |
| { |
| "epoch": 14.504854368932039, |
| "grad_norm": 3.1439263820648193, |
| "learning_rate": 4.969493279379398e-05, |
| "loss": 2.294, |
| "step": 363 |
| }, |
| { |
| "epoch": 14.54368932038835, |
| "grad_norm": 2.971735954284668, |
| "learning_rate": 4.968947220128045e-05, |
| "loss": 2.2882, |
| "step": 364 |
| }, |
| { |
| "epoch": 14.58252427184466, |
| "grad_norm": 2.860797166824341, |
| "learning_rate": 4.968396347537751e-05, |
| "loss": 2.1807, |
| "step": 365 |
| }, |
| { |
| "epoch": 14.62135922330097, |
| "grad_norm": 2.8869500160217285, |
| "learning_rate": 4.96784066268247e-05, |
| "loss": 2.267, |
| "step": 366 |
| }, |
| { |
| "epoch": 14.660194174757281, |
| "grad_norm": 3.185670852661133, |
| "learning_rate": 4.967280166645538e-05, |
| "loss": 2.2956, |
| "step": 367 |
| }, |
| { |
| "epoch": 14.699029126213592, |
| "grad_norm": 2.750898838043213, |
| "learning_rate": 4.96671486051967e-05, |
| "loss": 2.2429, |
| "step": 368 |
| }, |
| { |
| "epoch": 14.737864077669903, |
| "grad_norm": 2.690889596939087, |
| "learning_rate": 4.966144745406961e-05, |
| "loss": 2.2645, |
| "step": 369 |
| }, |
| { |
| "epoch": 14.776699029126213, |
| "grad_norm": 2.5257797241210938, |
| "learning_rate": 4.965569822418877e-05, |
| "loss": 2.1714, |
| "step": 370 |
| }, |
| { |
| "epoch": 14.815533980582524, |
| "grad_norm": 2.550966739654541, |
| "learning_rate": 4.964990092676263e-05, |
| "loss": 2.2281, |
| "step": 371 |
| }, |
| { |
| "epoch": 14.854368932038835, |
| "grad_norm": 2.6299831867218018, |
| "learning_rate": 4.964405557309328e-05, |
| "loss": 2.2925, |
| "step": 372 |
| }, |
| { |
| "epoch": 14.893203883495145, |
| "grad_norm": 2.8115315437316895, |
| "learning_rate": 4.963816217457657e-05, |
| "loss": 2.3404, |
| "step": 373 |
| }, |
| { |
| "epoch": 14.932038834951456, |
| "grad_norm": 2.646278142929077, |
| "learning_rate": 4.9632220742701965e-05, |
| "loss": 2.2326, |
| "step": 374 |
| }, |
| { |
| "epoch": 14.970873786407767, |
| "grad_norm": 2.667069435119629, |
| "learning_rate": 4.9626231289052596e-05, |
| "loss": 2.318, |
| "step": 375 |
| }, |
| { |
| "epoch": 14.970873786407767, |
| "eval_loss": 2.8448235988616943, |
| "eval_runtime": 0.9921, |
| "eval_samples_per_second": 22.176, |
| "eval_steps_per_second": 6.048, |
| "step": 375 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 2500, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 100, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 3395309036544000.0, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|