| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 10.0, |
| "eval_steps": 30, |
| "global_step": 590, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.017094017094017096, |
| "grad_norm": 23.0, |
| "learning_rate": 0.0, |
| "loss": 0.9788, |
| "num_tokens": 25595.0, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.03418803418803419, |
| "grad_norm": 25.625, |
| "learning_rate": 1.6666666666666668e-07, |
| "loss": 1.108, |
| "num_tokens": 49684.0, |
| "step": 2 |
| }, |
| { |
| "epoch": 0.05128205128205128, |
| "grad_norm": 25.0, |
| "learning_rate": 3.3333333333333335e-07, |
| "loss": 1.0319, |
| "num_tokens": 74064.0, |
| "step": 3 |
| }, |
| { |
| "epoch": 0.06837606837606838, |
| "grad_norm": 24.25, |
| "learning_rate": 5.000000000000001e-07, |
| "loss": 0.9993, |
| "num_tokens": 99232.0, |
| "step": 4 |
| }, |
| { |
| "epoch": 0.08547008547008547, |
| "grad_norm": 25.0, |
| "learning_rate": 6.666666666666667e-07, |
| "loss": 1.0217, |
| "num_tokens": 123515.0, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.10256410256410256, |
| "grad_norm": 23.625, |
| "learning_rate": 8.333333333333333e-07, |
| "loss": 1.0487, |
| "num_tokens": 149121.0, |
| "step": 6 |
| }, |
| { |
| "epoch": 0.11965811965811966, |
| "grad_norm": 24.625, |
| "learning_rate": 1.0000000000000002e-06, |
| "loss": 1.0543, |
| "num_tokens": 173222.0, |
| "step": 7 |
| }, |
| { |
| "epoch": 0.13675213675213677, |
| "grad_norm": 24.5, |
| "learning_rate": 1.1666666666666668e-06, |
| "loss": 1.001, |
| "num_tokens": 198434.0, |
| "step": 8 |
| }, |
| { |
| "epoch": 0.15384615384615385, |
| "grad_norm": 24.375, |
| "learning_rate": 1.3333333333333334e-06, |
| "loss": 1.0028, |
| "num_tokens": 222130.0, |
| "step": 9 |
| }, |
| { |
| "epoch": 0.17094017094017094, |
| "grad_norm": 22.375, |
| "learning_rate": 1.5e-06, |
| "loss": 0.9943, |
| "num_tokens": 247232.0, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.18803418803418803, |
| "grad_norm": 23.0, |
| "learning_rate": 1.6666666666666667e-06, |
| "loss": 1.037, |
| "num_tokens": 271615.0, |
| "step": 11 |
| }, |
| { |
| "epoch": 0.20512820512820512, |
| "grad_norm": 21.625, |
| "learning_rate": 1.8333333333333333e-06, |
| "loss": 0.9606, |
| "num_tokens": 296685.0, |
| "step": 12 |
| }, |
| { |
| "epoch": 0.2222222222222222, |
| "grad_norm": 21.25, |
| "learning_rate": 2.0000000000000003e-06, |
| "loss": 0.9682, |
| "num_tokens": 321158.0, |
| "step": 13 |
| }, |
| { |
| "epoch": 0.23931623931623933, |
| "grad_norm": 20.375, |
| "learning_rate": 2.166666666666667e-06, |
| "loss": 0.9362, |
| "num_tokens": 345732.0, |
| "step": 14 |
| }, |
| { |
| "epoch": 0.2564102564102564, |
| "grad_norm": 18.125, |
| "learning_rate": 2.3333333333333336e-06, |
| "loss": 0.9439, |
| "num_tokens": 370226.0, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.27350427350427353, |
| "grad_norm": 15.5625, |
| "learning_rate": 2.5e-06, |
| "loss": 0.8796, |
| "num_tokens": 395659.0, |
| "step": 16 |
| }, |
| { |
| "epoch": 0.2905982905982906, |
| "grad_norm": 13.6875, |
| "learning_rate": 2.666666666666667e-06, |
| "loss": 0.8444, |
| "num_tokens": 421481.0, |
| "step": 17 |
| }, |
| { |
| "epoch": 0.3076923076923077, |
| "grad_norm": 13.5, |
| "learning_rate": 2.8333333333333335e-06, |
| "loss": 0.8746, |
| "num_tokens": 445806.0, |
| "step": 18 |
| }, |
| { |
| "epoch": 0.3247863247863248, |
| "grad_norm": 12.0625, |
| "learning_rate": 3e-06, |
| "loss": 0.801, |
| "num_tokens": 470449.0, |
| "step": 19 |
| }, |
| { |
| "epoch": 0.3418803418803419, |
| "grad_norm": 11.4375, |
| "learning_rate": 3.1666666666666667e-06, |
| "loss": 0.7975, |
| "num_tokens": 495165.0, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.358974358974359, |
| "grad_norm": 10.375, |
| "learning_rate": 3.3333333333333333e-06, |
| "loss": 0.7578, |
| "num_tokens": 520803.0, |
| "step": 21 |
| }, |
| { |
| "epoch": 0.37606837606837606, |
| "grad_norm": 10.0625, |
| "learning_rate": 3.5e-06, |
| "loss": 0.7703, |
| "num_tokens": 545292.0, |
| "step": 22 |
| }, |
| { |
| "epoch": 0.39316239316239315, |
| "grad_norm": 9.0, |
| "learning_rate": 3.6666666666666666e-06, |
| "loss": 0.7019, |
| "num_tokens": 570089.0, |
| "step": 23 |
| }, |
| { |
| "epoch": 0.41025641025641024, |
| "grad_norm": 7.96875, |
| "learning_rate": 3.833333333333334e-06, |
| "loss": 0.6988, |
| "num_tokens": 594989.0, |
| "step": 24 |
| }, |
| { |
| "epoch": 0.42735042735042733, |
| "grad_norm": 7.6875, |
| "learning_rate": 4.000000000000001e-06, |
| "loss": 0.7039, |
| "num_tokens": 619663.0, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.4444444444444444, |
| "grad_norm": 7.09375, |
| "learning_rate": 4.166666666666667e-06, |
| "loss": 0.6563, |
| "num_tokens": 644898.0, |
| "step": 26 |
| }, |
| { |
| "epoch": 0.46153846153846156, |
| "grad_norm": 7.0625, |
| "learning_rate": 4.333333333333334e-06, |
| "loss": 0.653, |
| "num_tokens": 669606.0, |
| "step": 27 |
| }, |
| { |
| "epoch": 0.47863247863247865, |
| "grad_norm": 6.625, |
| "learning_rate": 4.5e-06, |
| "loss": 0.6002, |
| "num_tokens": 694163.0, |
| "step": 28 |
| }, |
| { |
| "epoch": 0.49572649572649574, |
| "grad_norm": 5.96875, |
| "learning_rate": 4.666666666666667e-06, |
| "loss": 0.6082, |
| "num_tokens": 720346.0, |
| "step": 29 |
| }, |
| { |
| "epoch": 0.5128205128205128, |
| "grad_norm": 5.59375, |
| "learning_rate": 4.833333333333333e-06, |
| "loss": 0.5732, |
| "num_tokens": 744937.0, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.5128205128205128, |
| "eval_loss": 0.6271482706069946, |
| "eval_num_tokens": 744937.0, |
| "eval_runtime": 3.9192, |
| "eval_samples_per_second": 336.551, |
| "eval_steps_per_second": 10.717, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.5299145299145299, |
| "grad_norm": 4.15625, |
| "learning_rate": 5e-06, |
| "loss": 0.6064, |
| "num_tokens": 770773.0, |
| "step": 31 |
| }, |
| { |
| "epoch": 0.5470085470085471, |
| "grad_norm": 3.65625, |
| "learning_rate": 4.999960660162164e-06, |
| "loss": 0.55, |
| "num_tokens": 795350.0, |
| "step": 32 |
| }, |
| { |
| "epoch": 0.5641025641025641, |
| "grad_norm": 3.359375, |
| "learning_rate": 4.999842641886752e-06, |
| "loss": 0.5366, |
| "num_tokens": 820624.0, |
| "step": 33 |
| }, |
| { |
| "epoch": 0.5811965811965812, |
| "grad_norm": 3.734375, |
| "learning_rate": 4.9996459488880215e-06, |
| "loss": 0.572, |
| "num_tokens": 844564.0, |
| "step": 34 |
| }, |
| { |
| "epoch": 0.5982905982905983, |
| "grad_norm": 3.71875, |
| "learning_rate": 4.999370587356267e-06, |
| "loss": 0.5462, |
| "num_tokens": 867822.0, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.6153846153846154, |
| "grad_norm": 3.53125, |
| "learning_rate": 4.999016565957633e-06, |
| "loss": 0.5399, |
| "num_tokens": 893203.0, |
| "step": 36 |
| }, |
| { |
| "epoch": 0.6324786324786325, |
| "grad_norm": 2.921875, |
| "learning_rate": 4.998583895833834e-06, |
| "loss": 0.5476, |
| "num_tokens": 919455.0, |
| "step": 37 |
| }, |
| { |
| "epoch": 0.6495726495726496, |
| "grad_norm": 2.640625, |
| "learning_rate": 4.998072590601808e-06, |
| "loss": 0.518, |
| "num_tokens": 944217.0, |
| "step": 38 |
| }, |
| { |
| "epoch": 0.6666666666666666, |
| "grad_norm": 2.703125, |
| "learning_rate": 4.997482666353287e-06, |
| "loss": 0.57, |
| "num_tokens": 968490.0, |
| "step": 39 |
| }, |
| { |
| "epoch": 0.6837606837606838, |
| "grad_norm": 2.71875, |
| "learning_rate": 4.996814141654291e-06, |
| "loss": 0.5546, |
| "num_tokens": 992306.0, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.7008547008547008, |
| "grad_norm": 2.4375, |
| "learning_rate": 4.996067037544542e-06, |
| "loss": 0.5206, |
| "num_tokens": 1016929.0, |
| "step": 41 |
| }, |
| { |
| "epoch": 0.717948717948718, |
| "grad_norm": 2.46875, |
| "learning_rate": 4.9952413775368034e-06, |
| "loss": 0.525, |
| "num_tokens": 1042218.0, |
| "step": 42 |
| }, |
| { |
| "epoch": 0.7350427350427351, |
| "grad_norm": 2.3125, |
| "learning_rate": 4.99433718761614e-06, |
| "loss": 0.5278, |
| "num_tokens": 1066745.0, |
| "step": 43 |
| }, |
| { |
| "epoch": 0.7521367521367521, |
| "grad_norm": 2.3125, |
| "learning_rate": 4.993354496239101e-06, |
| "loss": 0.5256, |
| "num_tokens": 1091730.0, |
| "step": 44 |
| }, |
| { |
| "epoch": 0.7692307692307693, |
| "grad_norm": 2.203125, |
| "learning_rate": 4.992293334332821e-06, |
| "loss": 0.5076, |
| "num_tokens": 1116183.0, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.7863247863247863, |
| "grad_norm": 2.15625, |
| "learning_rate": 4.9911537352940485e-06, |
| "loss": 0.5113, |
| "num_tokens": 1142542.0, |
| "step": 46 |
| }, |
| { |
| "epoch": 0.8034188034188035, |
| "grad_norm": 2.203125, |
| "learning_rate": 4.989935734988098e-06, |
| "loss": 0.5558, |
| "num_tokens": 1168446.0, |
| "step": 47 |
| }, |
| { |
| "epoch": 0.8205128205128205, |
| "grad_norm": 2.21875, |
| "learning_rate": 4.988639371747717e-06, |
| "loss": 0.5316, |
| "num_tokens": 1193788.0, |
| "step": 48 |
| }, |
| { |
| "epoch": 0.8376068376068376, |
| "grad_norm": 2.125, |
| "learning_rate": 4.987264686371881e-06, |
| "loss": 0.5293, |
| "num_tokens": 1218441.0, |
| "step": 49 |
| }, |
| { |
| "epoch": 0.8547008547008547, |
| "grad_norm": 2.359375, |
| "learning_rate": 4.98581172212451e-06, |
| "loss": 0.5493, |
| "num_tokens": 1242720.0, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.8717948717948718, |
| "grad_norm": 2.125, |
| "learning_rate": 4.984280524733107e-06, |
| "loss": 0.5171, |
| "num_tokens": 1268282.0, |
| "step": 51 |
| }, |
| { |
| "epoch": 0.8888888888888888, |
| "grad_norm": 2.046875, |
| "learning_rate": 4.982671142387316e-06, |
| "loss": 0.5173, |
| "num_tokens": 1292780.0, |
| "step": 52 |
| }, |
| { |
| "epoch": 0.905982905982906, |
| "grad_norm": 2.09375, |
| "learning_rate": 4.980983625737411e-06, |
| "loss": 0.5292, |
| "num_tokens": 1318538.0, |
| "step": 53 |
| }, |
| { |
| "epoch": 0.9230769230769231, |
| "grad_norm": 2.0625, |
| "learning_rate": 4.979218027892696e-06, |
| "loss": 0.5264, |
| "num_tokens": 1343366.0, |
| "step": 54 |
| }, |
| { |
| "epoch": 0.9401709401709402, |
| "grad_norm": 2.0, |
| "learning_rate": 4.977374404419838e-06, |
| "loss": 0.5078, |
| "num_tokens": 1368198.0, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.9572649572649573, |
| "grad_norm": 1.984375, |
| "learning_rate": 4.9754528133411144e-06, |
| "loss": 0.503, |
| "num_tokens": 1394723.0, |
| "step": 56 |
| }, |
| { |
| "epoch": 0.9743589743589743, |
| "grad_norm": 2.015625, |
| "learning_rate": 4.973453315132592e-06, |
| "loss": 0.5035, |
| "num_tokens": 1419991.0, |
| "step": 57 |
| }, |
| { |
| "epoch": 0.9914529914529915, |
| "grad_norm": 2.03125, |
| "learning_rate": 4.9713759727222184e-06, |
| "loss": 0.527, |
| "num_tokens": 1444550.0, |
| "step": 58 |
| }, |
| { |
| "epoch": 1.0, |
| "grad_norm": 3.0, |
| "learning_rate": 4.9692208514878445e-06, |
| "loss": 0.5228, |
| "num_tokens": 1454461.0, |
| "step": 59 |
| }, |
| { |
| "epoch": 1.017094017094017, |
| "grad_norm": 1.9296875, |
| "learning_rate": 4.966988019255167e-06, |
| "loss": 0.5265, |
| "num_tokens": 1479102.0, |
| "step": 60 |
| }, |
| { |
| "epoch": 1.017094017094017, |
| "eval_loss": 0.5714334845542908, |
| "eval_num_tokens": 1479102.0, |
| "eval_runtime": 3.6744, |
| "eval_samples_per_second": 358.965, |
| "eval_steps_per_second": 11.43, |
| "step": 60 |
| }, |
| { |
| "epoch": 1.0341880341880343, |
| "grad_norm": 1.9375, |
| "learning_rate": 4.96467754629559e-06, |
| "loss": 0.5178, |
| "num_tokens": 1504664.0, |
| "step": 61 |
| }, |
| { |
| "epoch": 1.0512820512820513, |
| "grad_norm": 1.875, |
| "learning_rate": 4.962289505324021e-06, |
| "loss": 0.4939, |
| "num_tokens": 1529770.0, |
| "step": 62 |
| }, |
| { |
| "epoch": 1.0683760683760684, |
| "grad_norm": 1.984375, |
| "learning_rate": 4.959823971496575e-06, |
| "loss": 0.5263, |
| "num_tokens": 1554337.0, |
| "step": 63 |
| }, |
| { |
| "epoch": 1.0854700854700854, |
| "grad_norm": 1.9453125, |
| "learning_rate": 4.957281022408212e-06, |
| "loss": 0.5433, |
| "num_tokens": 1579659.0, |
| "step": 64 |
| }, |
| { |
| "epoch": 1.1025641025641026, |
| "grad_norm": 1.984375, |
| "learning_rate": 4.954660738090297e-06, |
| "loss": 0.5237, |
| "num_tokens": 1605926.0, |
| "step": 65 |
| }, |
| { |
| "epoch": 1.1196581196581197, |
| "grad_norm": 1.96875, |
| "learning_rate": 4.9519632010080765e-06, |
| "loss": 0.5467, |
| "num_tokens": 1631620.0, |
| "step": 66 |
| }, |
| { |
| "epoch": 1.1367521367521367, |
| "grad_norm": 2.015625, |
| "learning_rate": 4.949188496058089e-06, |
| "loss": 0.5179, |
| "num_tokens": 1656811.0, |
| "step": 67 |
| }, |
| { |
| "epoch": 1.1538461538461537, |
| "grad_norm": 1.8359375, |
| "learning_rate": 4.946336710565489e-06, |
| "loss": 0.4956, |
| "num_tokens": 1682668.0, |
| "step": 68 |
| }, |
| { |
| "epoch": 1.170940170940171, |
| "grad_norm": 1.9609375, |
| "learning_rate": 4.943407934281298e-06, |
| "loss": 0.4481, |
| "num_tokens": 1707713.0, |
| "step": 69 |
| }, |
| { |
| "epoch": 1.188034188034188, |
| "grad_norm": 1.9921875, |
| "learning_rate": 4.940402259379585e-06, |
| "loss": 0.4895, |
| "num_tokens": 1732157.0, |
| "step": 70 |
| }, |
| { |
| "epoch": 1.205128205128205, |
| "grad_norm": 1.9453125, |
| "learning_rate": 4.937319780454559e-06, |
| "loss": 0.5289, |
| "num_tokens": 1757982.0, |
| "step": 71 |
| }, |
| { |
| "epoch": 1.2222222222222223, |
| "grad_norm": 1.9609375, |
| "learning_rate": 4.934160594517598e-06, |
| "loss": 0.5181, |
| "num_tokens": 1782772.0, |
| "step": 72 |
| }, |
| { |
| "epoch": 1.2393162393162394, |
| "grad_norm": 2.015625, |
| "learning_rate": 4.930924800994192e-06, |
| "loss": 0.5355, |
| "num_tokens": 1807558.0, |
| "step": 73 |
| }, |
| { |
| "epoch": 1.2564102564102564, |
| "grad_norm": 2.078125, |
| "learning_rate": 4.9276125017208144e-06, |
| "loss": 0.5314, |
| "num_tokens": 1831248.0, |
| "step": 74 |
| }, |
| { |
| "epoch": 1.2735042735042734, |
| "grad_norm": 1.8828125, |
| "learning_rate": 4.924223800941718e-06, |
| "loss": 0.4989, |
| "num_tokens": 1855690.0, |
| "step": 75 |
| }, |
| { |
| "epoch": 1.2905982905982907, |
| "grad_norm": 1.859375, |
| "learning_rate": 4.920758805305654e-06, |
| "loss": 0.5124, |
| "num_tokens": 1879982.0, |
| "step": 76 |
| }, |
| { |
| "epoch": 1.3076923076923077, |
| "grad_norm": 1.96875, |
| "learning_rate": 4.917217623862516e-06, |
| "loss": 0.4972, |
| "num_tokens": 1903772.0, |
| "step": 77 |
| }, |
| { |
| "epoch": 1.3247863247863247, |
| "grad_norm": 1.875, |
| "learning_rate": 4.913600368059908e-06, |
| "loss": 0.5096, |
| "num_tokens": 1928519.0, |
| "step": 78 |
| }, |
| { |
| "epoch": 1.341880341880342, |
| "grad_norm": 1.953125, |
| "learning_rate": 4.909907151739634e-06, |
| "loss": 0.5159, |
| "num_tokens": 1953015.0, |
| "step": 79 |
| }, |
| { |
| "epoch": 1.358974358974359, |
| "grad_norm": 1.78125, |
| "learning_rate": 4.906138091134118e-06, |
| "loss": 0.4839, |
| "num_tokens": 1977983.0, |
| "step": 80 |
| }, |
| { |
| "epoch": 1.376068376068376, |
| "grad_norm": 1.84375, |
| "learning_rate": 4.9022933048627496e-06, |
| "loss": 0.5061, |
| "num_tokens": 2002217.0, |
| "step": 81 |
| }, |
| { |
| "epoch": 1.393162393162393, |
| "grad_norm": 1.90625, |
| "learning_rate": 4.89837291392814e-06, |
| "loss": 0.4503, |
| "num_tokens": 2026080.0, |
| "step": 82 |
| }, |
| { |
| "epoch": 1.4102564102564101, |
| "grad_norm": 1.8984375, |
| "learning_rate": 4.894377041712327e-06, |
| "loss": 0.4999, |
| "num_tokens": 2051323.0, |
| "step": 83 |
| }, |
| { |
| "epoch": 1.4273504273504274, |
| "grad_norm": 1.8671875, |
| "learning_rate": 4.89030581397288e-06, |
| "loss": 0.504, |
| "num_tokens": 2077546.0, |
| "step": 84 |
| }, |
| { |
| "epoch": 1.4444444444444444, |
| "grad_norm": 1.8046875, |
| "learning_rate": 4.886159358838952e-06, |
| "loss": 0.4974, |
| "num_tokens": 2102220.0, |
| "step": 85 |
| }, |
| { |
| "epoch": 1.4615384615384617, |
| "grad_norm": 1.8515625, |
| "learning_rate": 4.881937806807241e-06, |
| "loss": 0.4785, |
| "num_tokens": 2126431.0, |
| "step": 86 |
| }, |
| { |
| "epoch": 1.4786324786324787, |
| "grad_norm": 2.0625, |
| "learning_rate": 4.8776412907378845e-06, |
| "loss": 0.5217, |
| "num_tokens": 2150716.0, |
| "step": 87 |
| }, |
| { |
| "epoch": 1.4957264957264957, |
| "grad_norm": 2.015625, |
| "learning_rate": 4.873269945850279e-06, |
| "loss": 0.5251, |
| "num_tokens": 2175905.0, |
| "step": 88 |
| }, |
| { |
| "epoch": 1.5128205128205128, |
| "grad_norm": 2.0625, |
| "learning_rate": 4.868823909718823e-06, |
| "loss": 0.5125, |
| "num_tokens": 2201148.0, |
| "step": 89 |
| }, |
| { |
| "epoch": 1.5299145299145298, |
| "grad_norm": 1.828125, |
| "learning_rate": 4.864303322268588e-06, |
| "loss": 0.4781, |
| "num_tokens": 2225118.0, |
| "step": 90 |
| }, |
| { |
| "epoch": 1.5299145299145298, |
| "eval_loss": 0.5624352097511292, |
| "eval_num_tokens": 2225118.0, |
| "eval_runtime": 3.7185, |
| "eval_samples_per_second": 354.711, |
| "eval_steps_per_second": 11.295, |
| "step": 90 |
| }, |
| { |
| "epoch": 1.547008547008547, |
| "grad_norm": 1.859375, |
| "learning_rate": 4.859708325770919e-06, |
| "loss": 0.4883, |
| "num_tokens": 2250080.0, |
| "step": 91 |
| }, |
| { |
| "epoch": 1.564102564102564, |
| "grad_norm": 1.8984375, |
| "learning_rate": 4.8550390648389475e-06, |
| "loss": 0.5148, |
| "num_tokens": 2276542.0, |
| "step": 92 |
| }, |
| { |
| "epoch": 1.5811965811965814, |
| "grad_norm": 1.9140625, |
| "learning_rate": 4.850295686423048e-06, |
| "loss": 0.5161, |
| "num_tokens": 2301549.0, |
| "step": 93 |
| }, |
| { |
| "epoch": 1.5982905982905984, |
| "grad_norm": 1.8515625, |
| "learning_rate": 4.845478339806211e-06, |
| "loss": 0.4967, |
| "num_tokens": 2326231.0, |
| "step": 94 |
| }, |
| { |
| "epoch": 1.6153846153846154, |
| "grad_norm": 1.9375, |
| "learning_rate": 4.8405871765993435e-06, |
| "loss": 0.5131, |
| "num_tokens": 2350342.0, |
| "step": 95 |
| }, |
| { |
| "epoch": 1.6324786324786325, |
| "grad_norm": 1.8671875, |
| "learning_rate": 4.835622350736499e-06, |
| "loss": 0.4803, |
| "num_tokens": 2374576.0, |
| "step": 96 |
| }, |
| { |
| "epoch": 1.6495726495726495, |
| "grad_norm": 1.84375, |
| "learning_rate": 4.830584018470036e-06, |
| "loss": 0.4972, |
| "num_tokens": 2401193.0, |
| "step": 97 |
| }, |
| { |
| "epoch": 1.6666666666666665, |
| "grad_norm": 1.9375, |
| "learning_rate": 4.825472338365691e-06, |
| "loss": 0.498, |
| "num_tokens": 2426218.0, |
| "step": 98 |
| }, |
| { |
| "epoch": 1.6837606837606838, |
| "grad_norm": 1.8203125, |
| "learning_rate": 4.820287471297598e-06, |
| "loss": 0.4849, |
| "num_tokens": 2450946.0, |
| "step": 99 |
| }, |
| { |
| "epoch": 1.7008547008547008, |
| "grad_norm": 2.0625, |
| "learning_rate": 4.81502958044322e-06, |
| "loss": 0.5148, |
| "num_tokens": 2475890.0, |
| "step": 100 |
| }, |
| { |
| "epoch": 1.717948717948718, |
| "grad_norm": 2.015625, |
| "learning_rate": 4.809698831278217e-06, |
| "loss": 0.5278, |
| "num_tokens": 2500942.0, |
| "step": 101 |
| }, |
| { |
| "epoch": 1.735042735042735, |
| "grad_norm": 1.9140625, |
| "learning_rate": 4.8042953915712354e-06, |
| "loss": 0.473, |
| "num_tokens": 2525522.0, |
| "step": 102 |
| }, |
| { |
| "epoch": 1.7521367521367521, |
| "grad_norm": 1.859375, |
| "learning_rate": 4.7988194313786275e-06, |
| "loss": 0.4824, |
| "num_tokens": 2550057.0, |
| "step": 103 |
| }, |
| { |
| "epoch": 1.7692307692307692, |
| "grad_norm": 1.90625, |
| "learning_rate": 4.7932711230391015e-06, |
| "loss": 0.4926, |
| "num_tokens": 2574298.0, |
| "step": 104 |
| }, |
| { |
| "epoch": 1.7863247863247862, |
| "grad_norm": 1.8671875, |
| "learning_rate": 4.7876506411683e-06, |
| "loss": 0.5095, |
| "num_tokens": 2599108.0, |
| "step": 105 |
| }, |
| { |
| "epoch": 1.8034188034188035, |
| "grad_norm": 1.96875, |
| "learning_rate": 4.781958162653298e-06, |
| "loss": 0.5089, |
| "num_tokens": 2624100.0, |
| "step": 106 |
| }, |
| { |
| "epoch": 1.8205128205128205, |
| "grad_norm": 1.734375, |
| "learning_rate": 4.7761938666470405e-06, |
| "loss": 0.4619, |
| "num_tokens": 2650393.0, |
| "step": 107 |
| }, |
| { |
| "epoch": 1.8376068376068377, |
| "grad_norm": 1.9921875, |
| "learning_rate": 4.770357934562704e-06, |
| "loss": 0.504, |
| "num_tokens": 2675279.0, |
| "step": 108 |
| }, |
| { |
| "epoch": 1.8547008547008548, |
| "grad_norm": 1.96875, |
| "learning_rate": 4.764450550067986e-06, |
| "loss": 0.4738, |
| "num_tokens": 2698998.0, |
| "step": 109 |
| }, |
| { |
| "epoch": 1.8717948717948718, |
| "grad_norm": 1.9140625, |
| "learning_rate": 4.758471899079325e-06, |
| "loss": 0.5221, |
| "num_tokens": 2723560.0, |
| "step": 110 |
| }, |
| { |
| "epoch": 1.8888888888888888, |
| "grad_norm": 1.8203125, |
| "learning_rate": 4.752422169756048e-06, |
| "loss": 0.5061, |
| "num_tokens": 2748478.0, |
| "step": 111 |
| }, |
| { |
| "epoch": 1.9059829059829059, |
| "grad_norm": 1.875, |
| "learning_rate": 4.746301552494453e-06, |
| "loss": 0.5154, |
| "num_tokens": 2774852.0, |
| "step": 112 |
| }, |
| { |
| "epoch": 1.9230769230769231, |
| "grad_norm": 1.9296875, |
| "learning_rate": 4.740110239921813e-06, |
| "loss": 0.5106, |
| "num_tokens": 2800119.0, |
| "step": 113 |
| }, |
| { |
| "epoch": 1.9401709401709402, |
| "grad_norm": 1.78125, |
| "learning_rate": 4.7338484268903125e-06, |
| "loss": 0.4917, |
| "num_tokens": 2826077.0, |
| "step": 114 |
| }, |
| { |
| "epoch": 1.9572649572649574, |
| "grad_norm": 1.921875, |
| "learning_rate": 4.72751631047092e-06, |
| "loss": 0.5087, |
| "num_tokens": 2851371.0, |
| "step": 115 |
| }, |
| { |
| "epoch": 1.9743589743589745, |
| "grad_norm": 2.03125, |
| "learning_rate": 4.721114089947181e-06, |
| "loss": 0.4763, |
| "num_tokens": 2875284.0, |
| "step": 116 |
| }, |
| { |
| "epoch": 1.9914529914529915, |
| "grad_norm": 1.9921875, |
| "learning_rate": 4.71464196680895e-06, |
| "loss": 0.5029, |
| "num_tokens": 2899750.0, |
| "step": 117 |
| }, |
| { |
| "epoch": 2.0, |
| "grad_norm": 3.265625, |
| "learning_rate": 4.708100144746046e-06, |
| "loss": 0.5426, |
| "num_tokens": 2908922.0, |
| "step": 118 |
| }, |
| { |
| "epoch": 2.017094017094017, |
| "grad_norm": 1.7109375, |
| "learning_rate": 4.701488829641845e-06, |
| "loss": 0.4663, |
| "num_tokens": 2935424.0, |
| "step": 119 |
| }, |
| { |
| "epoch": 2.034188034188034, |
| "grad_norm": 1.8515625, |
| "learning_rate": 4.6948082295667985e-06, |
| "loss": 0.4984, |
| "num_tokens": 2959733.0, |
| "step": 120 |
| }, |
| { |
| "epoch": 2.034188034188034, |
| "eval_loss": 0.5578281283378601, |
| "eval_num_tokens": 2959733.0, |
| "eval_runtime": 3.753, |
| "eval_samples_per_second": 351.454, |
| "eval_steps_per_second": 11.191, |
| "step": 120 |
| }, |
| { |
| "epoch": 2.051282051282051, |
| "grad_norm": 1.84375, |
| "learning_rate": 4.6880585547718845e-06, |
| "loss": 0.5201, |
| "num_tokens": 2984243.0, |
| "step": 121 |
| }, |
| { |
| "epoch": 2.0683760683760686, |
| "grad_norm": 1.9375, |
| "learning_rate": 4.681240017681994e-06, |
| "loss": 0.4877, |
| "num_tokens": 3007597.0, |
| "step": 122 |
| }, |
| { |
| "epoch": 2.0854700854700856, |
| "grad_norm": 1.8984375, |
| "learning_rate": 4.674352832889239e-06, |
| "loss": 0.52, |
| "num_tokens": 3031599.0, |
| "step": 123 |
| }, |
| { |
| "epoch": 2.1025641025641026, |
| "grad_norm": 1.9140625, |
| "learning_rate": 4.667397217146208e-06, |
| "loss": 0.4729, |
| "num_tokens": 3055857.0, |
| "step": 124 |
| }, |
| { |
| "epoch": 2.1196581196581197, |
| "grad_norm": 1.9375, |
| "learning_rate": 4.660373389359137e-06, |
| "loss": 0.507, |
| "num_tokens": 3080352.0, |
| "step": 125 |
| }, |
| { |
| "epoch": 2.1367521367521367, |
| "grad_norm": 1.859375, |
| "learning_rate": 4.653281570581023e-06, |
| "loss": 0.4814, |
| "num_tokens": 3104247.0, |
| "step": 126 |
| }, |
| { |
| "epoch": 2.1538461538461537, |
| "grad_norm": 1.8203125, |
| "learning_rate": 4.646121984004666e-06, |
| "loss": 0.462, |
| "num_tokens": 3128125.0, |
| "step": 127 |
| }, |
| { |
| "epoch": 2.1709401709401708, |
| "grad_norm": 1.9375, |
| "learning_rate": 4.638894854955645e-06, |
| "loss": 0.4944, |
| "num_tokens": 3152844.0, |
| "step": 128 |
| }, |
| { |
| "epoch": 2.1880341880341883, |
| "grad_norm": 1.8046875, |
| "learning_rate": 4.631600410885231e-06, |
| "loss": 0.4977, |
| "num_tokens": 3177853.0, |
| "step": 129 |
| }, |
| { |
| "epoch": 2.2051282051282053, |
| "grad_norm": 1.8828125, |
| "learning_rate": 4.624238881363219e-06, |
| "loss": 0.5246, |
| "num_tokens": 3203411.0, |
| "step": 130 |
| }, |
| { |
| "epoch": 2.2222222222222223, |
| "grad_norm": 1.8984375, |
| "learning_rate": 4.6168104980707105e-06, |
| "loss": 0.5069, |
| "num_tokens": 3228533.0, |
| "step": 131 |
| }, |
| { |
| "epoch": 2.2393162393162394, |
| "grad_norm": 2.046875, |
| "learning_rate": 4.609315494792823e-06, |
| "loss": 0.5008, |
| "num_tokens": 3252563.0, |
| "step": 132 |
| }, |
| { |
| "epoch": 2.2564102564102564, |
| "grad_norm": 2.078125, |
| "learning_rate": 4.601754107411326e-06, |
| "loss": 0.4893, |
| "num_tokens": 3276004.0, |
| "step": 133 |
| }, |
| { |
| "epoch": 2.2735042735042734, |
| "grad_norm": 1.84375, |
| "learning_rate": 4.594126573897222e-06, |
| "loss": 0.449, |
| "num_tokens": 3300197.0, |
| "step": 134 |
| }, |
| { |
| "epoch": 2.2905982905982905, |
| "grad_norm": 1.7890625, |
| "learning_rate": 4.586433134303257e-06, |
| "loss": 0.4602, |
| "num_tokens": 3324679.0, |
| "step": 135 |
| }, |
| { |
| "epoch": 2.3076923076923075, |
| "grad_norm": 1.7734375, |
| "learning_rate": 4.578674030756364e-06, |
| "loss": 0.4809, |
| "num_tokens": 3350582.0, |
| "step": 136 |
| }, |
| { |
| "epoch": 2.324786324786325, |
| "grad_norm": 1.9140625, |
| "learning_rate": 4.570849507450042e-06, |
| "loss": 0.5126, |
| "num_tokens": 3375575.0, |
| "step": 137 |
| }, |
| { |
| "epoch": 2.341880341880342, |
| "grad_norm": 1.8046875, |
| "learning_rate": 4.562959810636674e-06, |
| "loss": 0.4904, |
| "num_tokens": 3401385.0, |
| "step": 138 |
| }, |
| { |
| "epoch": 2.358974358974359, |
| "grad_norm": 1.7578125, |
| "learning_rate": 4.555005188619776e-06, |
| "loss": 0.4845, |
| "num_tokens": 3426459.0, |
| "step": 139 |
| }, |
| { |
| "epoch": 2.376068376068376, |
| "grad_norm": 1.8828125, |
| "learning_rate": 4.546985891746177e-06, |
| "loss": 0.5121, |
| "num_tokens": 3451522.0, |
| "step": 140 |
| }, |
| { |
| "epoch": 2.393162393162393, |
| "grad_norm": 1.828125, |
| "learning_rate": 4.538902172398151e-06, |
| "loss": 0.5063, |
| "num_tokens": 3477049.0, |
| "step": 141 |
| }, |
| { |
| "epoch": 2.41025641025641, |
| "grad_norm": 1.921875, |
| "learning_rate": 4.530754284985463e-06, |
| "loss": 0.4813, |
| "num_tokens": 3501648.0, |
| "step": 142 |
| }, |
| { |
| "epoch": 2.427350427350427, |
| "grad_norm": 1.84375, |
| "learning_rate": 4.522542485937369e-06, |
| "loss": 0.4835, |
| "num_tokens": 3526588.0, |
| "step": 143 |
| }, |
| { |
| "epoch": 2.4444444444444446, |
| "grad_norm": 1.765625, |
| "learning_rate": 4.514267033694544e-06, |
| "loss": 0.48, |
| "num_tokens": 3551875.0, |
| "step": 144 |
| }, |
| { |
| "epoch": 2.4615384615384617, |
| "grad_norm": 1.8359375, |
| "learning_rate": 4.505928188700946e-06, |
| "loss": 0.4868, |
| "num_tokens": 3576916.0, |
| "step": 145 |
| }, |
| { |
| "epoch": 2.4786324786324787, |
| "grad_norm": 1.921875, |
| "learning_rate": 4.4975262133956235e-06, |
| "loss": 0.4892, |
| "num_tokens": 3600882.0, |
| "step": 146 |
| }, |
| { |
| "epoch": 2.4957264957264957, |
| "grad_norm": 2.046875, |
| "learning_rate": 4.4890613722044526e-06, |
| "loss": 0.4793, |
| "num_tokens": 3625564.0, |
| "step": 147 |
| }, |
| { |
| "epoch": 2.5128205128205128, |
| "grad_norm": 2.03125, |
| "learning_rate": 4.480533931531819e-06, |
| "loss": 0.4893, |
| "num_tokens": 3650390.0, |
| "step": 148 |
| }, |
| { |
| "epoch": 2.52991452991453, |
| "grad_norm": 1.8515625, |
| "learning_rate": 4.471944159752228e-06, |
| "loss": 0.5143, |
| "num_tokens": 3675958.0, |
| "step": 149 |
| }, |
| { |
| "epoch": 2.547008547008547, |
| "grad_norm": 1.78125, |
| "learning_rate": 4.463292327201862e-06, |
| "loss": 0.4976, |
| "num_tokens": 3701445.0, |
| "step": 150 |
| }, |
| { |
| "epoch": 2.547008547008547, |
| "eval_loss": 0.5555291175842285, |
| "eval_num_tokens": 3701445.0, |
| "eval_runtime": 3.6005, |
| "eval_samples_per_second": 366.333, |
| "eval_steps_per_second": 11.665, |
| "step": 150 |
| }, |
| { |
| "epoch": 2.564102564102564, |
| "grad_norm": 1.734375, |
| "learning_rate": 4.454578706170075e-06, |
| "loss": 0.4661, |
| "num_tokens": 3726919.0, |
| "step": 151 |
| }, |
| { |
| "epoch": 2.5811965811965814, |
| "grad_norm": 1.953125, |
| "learning_rate": 4.445803570890815e-06, |
| "loss": 0.483, |
| "num_tokens": 3751903.0, |
| "step": 152 |
| }, |
| { |
| "epoch": 2.5982905982905984, |
| "grad_norm": 1.8671875, |
| "learning_rate": 4.436967197534003e-06, |
| "loss": 0.5107, |
| "num_tokens": 3776558.0, |
| "step": 153 |
| }, |
| { |
| "epoch": 2.6153846153846154, |
| "grad_norm": 1.75, |
| "learning_rate": 4.4280698641968335e-06, |
| "loss": 0.4787, |
| "num_tokens": 3802036.0, |
| "step": 154 |
| }, |
| { |
| "epoch": 2.6324786324786325, |
| "grad_norm": 1.7265625, |
| "learning_rate": 4.4191118508950286e-06, |
| "loss": 0.4584, |
| "num_tokens": 3827070.0, |
| "step": 155 |
| }, |
| { |
| "epoch": 2.6495726495726495, |
| "grad_norm": 1.859375, |
| "learning_rate": 4.410093439554019e-06, |
| "loss": 0.5037, |
| "num_tokens": 3852076.0, |
| "step": 156 |
| }, |
| { |
| "epoch": 2.6666666666666665, |
| "grad_norm": 1.8125, |
| "learning_rate": 4.401014914000078e-06, |
| "loss": 0.4796, |
| "num_tokens": 3877546.0, |
| "step": 157 |
| }, |
| { |
| "epoch": 2.683760683760684, |
| "grad_norm": 1.671875, |
| "learning_rate": 4.391876559951383e-06, |
| "loss": 0.4525, |
| "num_tokens": 3902477.0, |
| "step": 158 |
| }, |
| { |
| "epoch": 2.700854700854701, |
| "grad_norm": 1.7890625, |
| "learning_rate": 4.382678665009028e-06, |
| "loss": 0.4831, |
| "num_tokens": 3927621.0, |
| "step": 159 |
| }, |
| { |
| "epoch": 2.717948717948718, |
| "grad_norm": 1.859375, |
| "learning_rate": 4.373421518647968e-06, |
| "loss": 0.508, |
| "num_tokens": 3953135.0, |
| "step": 160 |
| }, |
| { |
| "epoch": 2.735042735042735, |
| "grad_norm": 1.9375, |
| "learning_rate": 4.364105412207914e-06, |
| "loss": 0.4923, |
| "num_tokens": 3977233.0, |
| "step": 161 |
| }, |
| { |
| "epoch": 2.752136752136752, |
| "grad_norm": 1.859375, |
| "learning_rate": 4.35473063888416e-06, |
| "loss": 0.4915, |
| "num_tokens": 4002412.0, |
| "step": 162 |
| }, |
| { |
| "epoch": 2.769230769230769, |
| "grad_norm": 1.7890625, |
| "learning_rate": 4.345297493718352e-06, |
| "loss": 0.5062, |
| "num_tokens": 4027883.0, |
| "step": 163 |
| }, |
| { |
| "epoch": 2.786324786324786, |
| "grad_norm": 1.7265625, |
| "learning_rate": 4.335806273589214e-06, |
| "loss": 0.4649, |
| "num_tokens": 4052975.0, |
| "step": 164 |
| }, |
| { |
| "epoch": 2.8034188034188032, |
| "grad_norm": 1.90625, |
| "learning_rate": 4.326257277203194e-06, |
| "loss": 0.499, |
| "num_tokens": 4077632.0, |
| "step": 165 |
| }, |
| { |
| "epoch": 2.8205128205128203, |
| "grad_norm": 1.921875, |
| "learning_rate": 4.316650805085068e-06, |
| "loss": 0.4717, |
| "num_tokens": 4100768.0, |
| "step": 166 |
| }, |
| { |
| "epoch": 2.8376068376068377, |
| "grad_norm": 1.8359375, |
| "learning_rate": 4.3069871595684795e-06, |
| "loss": 0.4634, |
| "num_tokens": 4125382.0, |
| "step": 167 |
| }, |
| { |
| "epoch": 2.8547008547008548, |
| "grad_norm": 1.8984375, |
| "learning_rate": 4.297266644786426e-06, |
| "loss": 0.5104, |
| "num_tokens": 4151107.0, |
| "step": 168 |
| }, |
| { |
| "epoch": 2.871794871794872, |
| "grad_norm": 1.8828125, |
| "learning_rate": 4.287489566661689e-06, |
| "loss": 0.5049, |
| "num_tokens": 4175799.0, |
| "step": 169 |
| }, |
| { |
| "epoch": 2.888888888888889, |
| "grad_norm": 1.75, |
| "learning_rate": 4.277656232897201e-06, |
| "loss": 0.5009, |
| "num_tokens": 4201835.0, |
| "step": 170 |
| }, |
| { |
| "epoch": 2.905982905982906, |
| "grad_norm": 1.8046875, |
| "learning_rate": 4.267766952966369e-06, |
| "loss": 0.4712, |
| "num_tokens": 4226851.0, |
| "step": 171 |
| }, |
| { |
| "epoch": 2.9230769230769234, |
| "grad_norm": 1.9453125, |
| "learning_rate": 4.257822038103326e-06, |
| "loss": 0.4979, |
| "num_tokens": 4252450.0, |
| "step": 172 |
| }, |
| { |
| "epoch": 2.9401709401709404, |
| "grad_norm": 1.78125, |
| "learning_rate": 4.247821801293144e-06, |
| "loss": 0.4507, |
| "num_tokens": 4278416.0, |
| "step": 173 |
| }, |
| { |
| "epoch": 2.9572649572649574, |
| "grad_norm": 1.828125, |
| "learning_rate": 4.237766557261977e-06, |
| "loss": 0.4681, |
| "num_tokens": 4302908.0, |
| "step": 174 |
| }, |
| { |
| "epoch": 2.9743589743589745, |
| "grad_norm": 1.734375, |
| "learning_rate": 4.227656622467162e-06, |
| "loss": 0.479, |
| "num_tokens": 4328683.0, |
| "step": 175 |
| }, |
| { |
| "epoch": 2.9914529914529915, |
| "grad_norm": 1.7890625, |
| "learning_rate": 4.217492315087255e-06, |
| "loss": 0.4522, |
| "num_tokens": 4354142.0, |
| "step": 176 |
| }, |
| { |
| "epoch": 3.0, |
| "grad_norm": 3.09375, |
| "learning_rate": 4.207273955012018e-06, |
| "loss": 0.4855, |
| "num_tokens": 4363383.0, |
| "step": 177 |
| }, |
| { |
| "epoch": 3.017094017094017, |
| "grad_norm": 1.7890625, |
| "learning_rate": 4.197001863832355e-06, |
| "loss": 0.4493, |
| "num_tokens": 4388966.0, |
| "step": 178 |
| }, |
| { |
| "epoch": 3.034188034188034, |
| "grad_norm": 1.8046875, |
| "learning_rate": 4.186676364830187e-06, |
| "loss": 0.5116, |
| "num_tokens": 4414781.0, |
| "step": 179 |
| }, |
| { |
| "epoch": 3.051282051282051, |
| "grad_norm": 1.7890625, |
| "learning_rate": 4.176297782968277e-06, |
| "loss": 0.4707, |
| "num_tokens": 4440274.0, |
| "step": 180 |
| }, |
| { |
| "epoch": 3.051282051282051, |
| "eval_loss": 0.5535330772399902, |
| "eval_num_tokens": 4440274.0, |
| "eval_runtime": 3.8645, |
| "eval_samples_per_second": 341.31, |
| "eval_steps_per_second": 10.868, |
| "step": 180 |
| }, |
| { |
| "epoch": 3.0683760683760686, |
| "grad_norm": 1.8828125, |
| "learning_rate": 4.1658664448800105e-06, |
| "loss": 0.4768, |
| "num_tokens": 4463887.0, |
| "step": 181 |
| }, |
| { |
| "epoch": 3.0854700854700856, |
| "grad_norm": 1.7578125, |
| "learning_rate": 4.155382678859103e-06, |
| "loss": 0.4578, |
| "num_tokens": 4489299.0, |
| "step": 182 |
| }, |
| { |
| "epoch": 3.1025641025641026, |
| "grad_norm": 1.8828125, |
| "learning_rate": 4.144846814849282e-06, |
| "loss": 0.5149, |
| "num_tokens": 4514175.0, |
| "step": 183 |
| }, |
| { |
| "epoch": 3.1196581196581197, |
| "grad_norm": 1.9375, |
| "learning_rate": 4.134259184433891e-06, |
| "loss": 0.4797, |
| "num_tokens": 4538592.0, |
| "step": 184 |
| }, |
| { |
| "epoch": 3.1367521367521367, |
| "grad_norm": 1.859375, |
| "learning_rate": 4.123620120825459e-06, |
| "loss": 0.5062, |
| "num_tokens": 4563767.0, |
| "step": 185 |
| }, |
| { |
| "epoch": 3.1538461538461537, |
| "grad_norm": 1.921875, |
| "learning_rate": 4.11292995885522e-06, |
| "loss": 0.4816, |
| "num_tokens": 4588715.0, |
| "step": 186 |
| }, |
| { |
| "epoch": 3.1709401709401708, |
| "grad_norm": 1.8125, |
| "learning_rate": 4.102189034962561e-06, |
| "loss": 0.4779, |
| "num_tokens": 4613441.0, |
| "step": 187 |
| }, |
| { |
| "epoch": 3.1880341880341883, |
| "grad_norm": 1.9453125, |
| "learning_rate": 4.091397687184447e-06, |
| "loss": 0.4652, |
| "num_tokens": 4637436.0, |
| "step": 188 |
| }, |
| { |
| "epoch": 3.2051282051282053, |
| "grad_norm": 1.9453125, |
| "learning_rate": 4.080556255144775e-06, |
| "loss": 0.4572, |
| "num_tokens": 4661231.0, |
| "step": 189 |
| }, |
| { |
| "epoch": 3.2222222222222223, |
| "grad_norm": 1.8125, |
| "learning_rate": 4.069665080043687e-06, |
| "loss": 0.4529, |
| "num_tokens": 4686469.0, |
| "step": 190 |
| }, |
| { |
| "epoch": 3.2393162393162394, |
| "grad_norm": 1.796875, |
| "learning_rate": 4.058724504646834e-06, |
| "loss": 0.4585, |
| "num_tokens": 4711342.0, |
| "step": 191 |
| }, |
| { |
| "epoch": 3.2564102564102564, |
| "grad_norm": 1.8359375, |
| "learning_rate": 4.047734873274586e-06, |
| "loss": 0.4569, |
| "num_tokens": 4735893.0, |
| "step": 192 |
| }, |
| { |
| "epoch": 3.2735042735042734, |
| "grad_norm": 1.765625, |
| "learning_rate": 4.036696531791193e-06, |
| "loss": 0.4952, |
| "num_tokens": 4761547.0, |
| "step": 193 |
| }, |
| { |
| "epoch": 3.2905982905982905, |
| "grad_norm": 2.03125, |
| "learning_rate": 4.025609827593909e-06, |
| "loss": 0.463, |
| "num_tokens": 4785024.0, |
| "step": 194 |
| }, |
| { |
| "epoch": 3.3076923076923075, |
| "grad_norm": 1.75, |
| "learning_rate": 4.01447510960205e-06, |
| "loss": 0.4517, |
| "num_tokens": 4809836.0, |
| "step": 195 |
| }, |
| { |
| "epoch": 3.324786324786325, |
| "grad_norm": 1.8984375, |
| "learning_rate": 4.003292728246015e-06, |
| "loss": 0.4726, |
| "num_tokens": 4835379.0, |
| "step": 196 |
| }, |
| { |
| "epoch": 3.341880341880342, |
| "grad_norm": 1.9296875, |
| "learning_rate": 3.9920630354562595e-06, |
| "loss": 0.4653, |
| "num_tokens": 4860091.0, |
| "step": 197 |
| }, |
| { |
| "epoch": 3.358974358974359, |
| "grad_norm": 1.9140625, |
| "learning_rate": 3.9807863846522186e-06, |
| "loss": 0.4589, |
| "num_tokens": 4884680.0, |
| "step": 198 |
| }, |
| { |
| "epoch": 3.376068376068376, |
| "grad_norm": 1.8359375, |
| "learning_rate": 3.969463130731183e-06, |
| "loss": 0.4956, |
| "num_tokens": 4909308.0, |
| "step": 199 |
| }, |
| { |
| "epoch": 3.393162393162393, |
| "grad_norm": 1.90625, |
| "learning_rate": 3.958093630057132e-06, |
| "loss": 0.437, |
| "num_tokens": 4933587.0, |
| "step": 200 |
| }, |
| { |
| "epoch": 3.41025641025641, |
| "grad_norm": 1.828125, |
| "learning_rate": 3.946678240449515e-06, |
| "loss": 0.4809, |
| "num_tokens": 4958005.0, |
| "step": 201 |
| }, |
| { |
| "epoch": 3.427350427350427, |
| "grad_norm": 1.7109375, |
| "learning_rate": 3.935217321171992e-06, |
| "loss": 0.4505, |
| "num_tokens": 4983166.0, |
| "step": 202 |
| }, |
| { |
| "epoch": 3.4444444444444446, |
| "grad_norm": 2.03125, |
| "learning_rate": 3.92371123292113e-06, |
| "loss": 0.4751, |
| "num_tokens": 5008231.0, |
| "step": 203 |
| }, |
| { |
| "epoch": 3.4615384615384617, |
| "grad_norm": 1.8359375, |
| "learning_rate": 3.912160337815045e-06, |
| "loss": 0.4717, |
| "num_tokens": 5033072.0, |
| "step": 204 |
| }, |
| { |
| "epoch": 3.4786324786324787, |
| "grad_norm": 1.765625, |
| "learning_rate": 3.900564999382007e-06, |
| "loss": 0.473, |
| "num_tokens": 5058720.0, |
| "step": 205 |
| }, |
| { |
| "epoch": 3.4957264957264957, |
| "grad_norm": 1.84375, |
| "learning_rate": 3.888925582549006e-06, |
| "loss": 0.5132, |
| "num_tokens": 5084491.0, |
| "step": 206 |
| }, |
| { |
| "epoch": 3.5128205128205128, |
| "grad_norm": 1.9296875, |
| "learning_rate": 3.8772424536302565e-06, |
| "loss": 0.4912, |
| "num_tokens": 5108798.0, |
| "step": 207 |
| }, |
| { |
| "epoch": 3.52991452991453, |
| "grad_norm": 1.7890625, |
| "learning_rate": 3.865515980315677e-06, |
| "loss": 0.4467, |
| "num_tokens": 5133691.0, |
| "step": 208 |
| }, |
| { |
| "epoch": 3.547008547008547, |
| "grad_norm": 1.7734375, |
| "learning_rate": 3.853746531659315e-06, |
| "loss": 0.4636, |
| "num_tokens": 5158639.0, |
| "step": 209 |
| }, |
| { |
| "epoch": 3.564102564102564, |
| "grad_norm": 1.90625, |
| "learning_rate": 3.84193447806773e-06, |
| "loss": 0.4878, |
| "num_tokens": 5183688.0, |
| "step": 210 |
| }, |
| { |
| "epoch": 3.564102564102564, |
| "eval_loss": 0.5530414581298828, |
| "eval_num_tokens": 5183688.0, |
| "eval_runtime": 3.7837, |
| "eval_samples_per_second": 348.596, |
| "eval_steps_per_second": 11.1, |
| "step": 210 |
| }, |
| { |
| "epoch": 3.5811965811965814, |
| "grad_norm": 1.8828125, |
| "learning_rate": 3.830080191288342e-06, |
| "loss": 0.4893, |
| "num_tokens": 5208452.0, |
| "step": 211 |
| }, |
| { |
| "epoch": 3.5982905982905984, |
| "grad_norm": 1.875, |
| "learning_rate": 3.8181840443977254e-06, |
| "loss": 0.5098, |
| "num_tokens": 5233187.0, |
| "step": 212 |
| }, |
| { |
| "epoch": 3.6153846153846154, |
| "grad_norm": 1.75, |
| "learning_rate": 3.806246411789872e-06, |
| "loss": 0.4971, |
| "num_tokens": 5259781.0, |
| "step": 213 |
| }, |
| { |
| "epoch": 3.6324786324786325, |
| "grad_norm": 1.90625, |
| "learning_rate": 3.794267669164408e-06, |
| "loss": 0.5001, |
| "num_tokens": 5284965.0, |
| "step": 214 |
| }, |
| { |
| "epoch": 3.6495726495726495, |
| "grad_norm": 1.8984375, |
| "learning_rate": 3.782248193514766e-06, |
| "loss": 0.4903, |
| "num_tokens": 5308660.0, |
| "step": 215 |
| }, |
| { |
| "epoch": 3.6666666666666665, |
| "grad_norm": 1.8671875, |
| "learning_rate": 3.770188363116324e-06, |
| "loss": 0.4756, |
| "num_tokens": 5332679.0, |
| "step": 216 |
| }, |
| { |
| "epoch": 3.683760683760684, |
| "grad_norm": 2.0, |
| "learning_rate": 3.758088557514501e-06, |
| "loss": 0.4933, |
| "num_tokens": 5356650.0, |
| "step": 217 |
| }, |
| { |
| "epoch": 3.700854700854701, |
| "grad_norm": 1.8828125, |
| "learning_rate": 3.7459491575128076e-06, |
| "loss": 0.5105, |
| "num_tokens": 5381408.0, |
| "step": 218 |
| }, |
| { |
| "epoch": 3.717948717948718, |
| "grad_norm": 1.7890625, |
| "learning_rate": 3.7337705451608676e-06, |
| "loss": 0.4813, |
| "num_tokens": 5406861.0, |
| "step": 219 |
| }, |
| { |
| "epoch": 3.735042735042735, |
| "grad_norm": 1.8359375, |
| "learning_rate": 3.721553103742388e-06, |
| "loss": 0.5061, |
| "num_tokens": 5432583.0, |
| "step": 220 |
| }, |
| { |
| "epoch": 3.752136752136752, |
| "grad_norm": 1.8515625, |
| "learning_rate": 3.7092972177630998e-06, |
| "loss": 0.4987, |
| "num_tokens": 5457632.0, |
| "step": 221 |
| }, |
| { |
| "epoch": 3.769230769230769, |
| "grad_norm": 1.8359375, |
| "learning_rate": 3.6970032729386573e-06, |
| "loss": 0.4942, |
| "num_tokens": 5482843.0, |
| "step": 222 |
| }, |
| { |
| "epoch": 3.786324786324786, |
| "grad_norm": 1.6875, |
| "learning_rate": 3.684671656182497e-06, |
| "loss": 0.451, |
| "num_tokens": 5507998.0, |
| "step": 223 |
| }, |
| { |
| "epoch": 3.8034188034188032, |
| "grad_norm": 1.734375, |
| "learning_rate": 3.672302755593661e-06, |
| "loss": 0.4682, |
| "num_tokens": 5533208.0, |
| "step": 224 |
| }, |
| { |
| "epoch": 3.8205128205128203, |
| "grad_norm": 1.984375, |
| "learning_rate": 3.6598969604445854e-06, |
| "loss": 0.4823, |
| "num_tokens": 5557678.0, |
| "step": 225 |
| }, |
| { |
| "epoch": 3.8376068376068377, |
| "grad_norm": 1.875, |
| "learning_rate": 3.6474546611688446e-06, |
| "loss": 0.4448, |
| "num_tokens": 5582601.0, |
| "step": 226 |
| }, |
| { |
| "epoch": 3.8547008547008548, |
| "grad_norm": 1.921875, |
| "learning_rate": 3.634976249348867e-06, |
| "loss": 0.4716, |
| "num_tokens": 5608386.0, |
| "step": 227 |
| }, |
| { |
| "epoch": 3.871794871794872, |
| "grad_norm": 1.78125, |
| "learning_rate": 3.622462117703612e-06, |
| "loss": 0.4714, |
| "num_tokens": 5633161.0, |
| "step": 228 |
| }, |
| { |
| "epoch": 3.888888888888889, |
| "grad_norm": 1.84375, |
| "learning_rate": 3.6099126600762056e-06, |
| "loss": 0.5174, |
| "num_tokens": 5658917.0, |
| "step": 229 |
| }, |
| { |
| "epoch": 3.905982905982906, |
| "grad_norm": 1.7578125, |
| "learning_rate": 3.5973282714215514e-06, |
| "loss": 0.475, |
| "num_tokens": 5684214.0, |
| "step": 230 |
| }, |
| { |
| "epoch": 3.9230769230769234, |
| "grad_norm": 1.75, |
| "learning_rate": 3.5847093477938955e-06, |
| "loss": 0.4715, |
| "num_tokens": 5709024.0, |
| "step": 231 |
| }, |
| { |
| "epoch": 3.9401709401709404, |
| "grad_norm": 1.84375, |
| "learning_rate": 3.5720562863343668e-06, |
| "loss": 0.4783, |
| "num_tokens": 5734477.0, |
| "step": 232 |
| }, |
| { |
| "epoch": 3.9572649572649574, |
| "grad_norm": 1.8671875, |
| "learning_rate": 3.559369485258472e-06, |
| "loss": 0.482, |
| "num_tokens": 5758325.0, |
| "step": 233 |
| }, |
| { |
| "epoch": 3.9743589743589745, |
| "grad_norm": 1.859375, |
| "learning_rate": 3.5466493438435707e-06, |
| "loss": 0.4831, |
| "num_tokens": 5783893.0, |
| "step": 234 |
| }, |
| { |
| "epoch": 3.9914529914529915, |
| "grad_norm": 1.8515625, |
| "learning_rate": 3.533896262416302e-06, |
| "loss": 0.4902, |
| "num_tokens": 5809115.0, |
| "step": 235 |
| }, |
| { |
| "epoch": 4.0, |
| "grad_norm": 3.03125, |
| "learning_rate": 3.521110642339991e-06, |
| "loss": 0.4344, |
| "num_tokens": 5817844.0, |
| "step": 236 |
| }, |
| { |
| "epoch": 4.017094017094017, |
| "grad_norm": 1.7421875, |
| "learning_rate": 3.508292886002013e-06, |
| "loss": 0.4787, |
| "num_tokens": 5843069.0, |
| "step": 237 |
| }, |
| { |
| "epoch": 4.034188034188034, |
| "grad_norm": 1.96875, |
| "learning_rate": 3.495443396801134e-06, |
| "loss": 0.4796, |
| "num_tokens": 5866440.0, |
| "step": 238 |
| }, |
| { |
| "epoch": 4.051282051282051, |
| "grad_norm": 1.8359375, |
| "learning_rate": 3.4825625791348093e-06, |
| "loss": 0.4859, |
| "num_tokens": 5891282.0, |
| "step": 239 |
| }, |
| { |
| "epoch": 4.068376068376068, |
| "grad_norm": 1.8046875, |
| "learning_rate": 3.4696508383864636e-06, |
| "loss": 0.4602, |
| "num_tokens": 5915722.0, |
| "step": 240 |
| }, |
| { |
| "epoch": 4.068376068376068, |
| "eval_loss": 0.55167156457901, |
| "eval_num_tokens": 5915722.0, |
| "eval_runtime": 3.8113, |
| "eval_samples_per_second": 346.076, |
| "eval_steps_per_second": 11.02, |
| "step": 240 |
| }, |
| { |
| "epoch": 4.085470085470085, |
| "grad_norm": 1.796875, |
| "learning_rate": 3.4567085809127247e-06, |
| "loss": 0.4978, |
| "num_tokens": 5940349.0, |
| "step": 241 |
| }, |
| { |
| "epoch": 4.102564102564102, |
| "grad_norm": 1.7578125, |
| "learning_rate": 3.4437362140306423e-06, |
| "loss": 0.4968, |
| "num_tokens": 5966103.0, |
| "step": 242 |
| }, |
| { |
| "epoch": 4.119658119658119, |
| "grad_norm": 1.8046875, |
| "learning_rate": 3.4307341460048633e-06, |
| "loss": 0.4989, |
| "num_tokens": 5991498.0, |
| "step": 243 |
| }, |
| { |
| "epoch": 4.136752136752137, |
| "grad_norm": 1.7890625, |
| "learning_rate": 3.417702786034786e-06, |
| "loss": 0.4921, |
| "num_tokens": 6017439.0, |
| "step": 244 |
| }, |
| { |
| "epoch": 4.153846153846154, |
| "grad_norm": 1.859375, |
| "learning_rate": 3.4046425442416807e-06, |
| "loss": 0.4648, |
| "num_tokens": 6042023.0, |
| "step": 245 |
| }, |
| { |
| "epoch": 4.170940170940171, |
| "grad_norm": 1.796875, |
| "learning_rate": 3.391553831655783e-06, |
| "loss": 0.5102, |
| "num_tokens": 6067004.0, |
| "step": 246 |
| }, |
| { |
| "epoch": 4.188034188034188, |
| "grad_norm": 1.953125, |
| "learning_rate": 3.3784370602033572e-06, |
| "loss": 0.4846, |
| "num_tokens": 6090221.0, |
| "step": 247 |
| }, |
| { |
| "epoch": 4.205128205128205, |
| "grad_norm": 1.9453125, |
| "learning_rate": 3.3652926426937327e-06, |
| "loss": 0.5134, |
| "num_tokens": 6115452.0, |
| "step": 248 |
| }, |
| { |
| "epoch": 4.222222222222222, |
| "grad_norm": 1.7421875, |
| "learning_rate": 3.3521209928063127e-06, |
| "loss": 0.4557, |
| "num_tokens": 6140900.0, |
| "step": 249 |
| }, |
| { |
| "epoch": 4.239316239316239, |
| "grad_norm": 1.8125, |
| "learning_rate": 3.3389225250775533e-06, |
| "loss": 0.4488, |
| "num_tokens": 6166140.0, |
| "step": 250 |
| }, |
| { |
| "epoch": 4.256410256410256, |
| "grad_norm": 1.921875, |
| "learning_rate": 3.3256976548879183e-06, |
| "loss": 0.4794, |
| "num_tokens": 6191157.0, |
| "step": 251 |
| }, |
| { |
| "epoch": 4.273504273504273, |
| "grad_norm": 1.8203125, |
| "learning_rate": 3.3124467984488067e-06, |
| "loss": 0.4485, |
| "num_tokens": 6216180.0, |
| "step": 252 |
| }, |
| { |
| "epoch": 4.2905982905982905, |
| "grad_norm": 1.7578125, |
| "learning_rate": 3.299170372789454e-06, |
| "loss": 0.4309, |
| "num_tokens": 6240305.0, |
| "step": 253 |
| }, |
| { |
| "epoch": 4.3076923076923075, |
| "grad_norm": 1.7265625, |
| "learning_rate": 3.2858687957438056e-06, |
| "loss": 0.464, |
| "num_tokens": 6266444.0, |
| "step": 254 |
| }, |
| { |
| "epoch": 4.3247863247863245, |
| "grad_norm": 1.8046875, |
| "learning_rate": 3.272542485937369e-06, |
| "loss": 0.4926, |
| "num_tokens": 6290807.0, |
| "step": 255 |
| }, |
| { |
| "epoch": 4.3418803418803416, |
| "grad_norm": 1.8828125, |
| "learning_rate": 3.259191862774037e-06, |
| "loss": 0.4686, |
| "num_tokens": 6315562.0, |
| "step": 256 |
| }, |
| { |
| "epoch": 4.358974358974359, |
| "grad_norm": 1.875, |
| "learning_rate": 3.2458173464228905e-06, |
| "loss": 0.494, |
| "num_tokens": 6340242.0, |
| "step": 257 |
| }, |
| { |
| "epoch": 4.3760683760683765, |
| "grad_norm": 1.90625, |
| "learning_rate": 3.2324193578049727e-06, |
| "loss": 0.4891, |
| "num_tokens": 6364338.0, |
| "step": 258 |
| }, |
| { |
| "epoch": 4.3931623931623935, |
| "grad_norm": 1.7890625, |
| "learning_rate": 3.218998318580043e-06, |
| "loss": 0.488, |
| "num_tokens": 6389547.0, |
| "step": 259 |
| }, |
| { |
| "epoch": 4.410256410256411, |
| "grad_norm": 1.765625, |
| "learning_rate": 3.205554651133308e-06, |
| "loss": 0.465, |
| "num_tokens": 6414507.0, |
| "step": 260 |
| }, |
| { |
| "epoch": 4.427350427350428, |
| "grad_norm": 1.8828125, |
| "learning_rate": 3.1920887785621233e-06, |
| "loss": 0.4636, |
| "num_tokens": 6439551.0, |
| "step": 261 |
| }, |
| { |
| "epoch": 4.444444444444445, |
| "grad_norm": 1.9140625, |
| "learning_rate": 3.1786011246626858e-06, |
| "loss": 0.4931, |
| "num_tokens": 6464477.0, |
| "step": 262 |
| }, |
| { |
| "epoch": 4.461538461538462, |
| "grad_norm": 1.921875, |
| "learning_rate": 3.165092113916688e-06, |
| "loss": 0.4377, |
| "num_tokens": 6488087.0, |
| "step": 263 |
| }, |
| { |
| "epoch": 4.478632478632479, |
| "grad_norm": 1.984375, |
| "learning_rate": 3.151562171477964e-06, |
| "loss": 0.5193, |
| "num_tokens": 6512609.0, |
| "step": 264 |
| }, |
| { |
| "epoch": 4.495726495726496, |
| "grad_norm": 1.8046875, |
| "learning_rate": 3.138011723159107e-06, |
| "loss": 0.4167, |
| "num_tokens": 6536550.0, |
| "step": 265 |
| }, |
| { |
| "epoch": 4.512820512820513, |
| "grad_norm": 1.796875, |
| "learning_rate": 3.1244411954180677e-06, |
| "loss": 0.4742, |
| "num_tokens": 6562337.0, |
| "step": 266 |
| }, |
| { |
| "epoch": 4.52991452991453, |
| "grad_norm": 1.796875, |
| "learning_rate": 3.1108510153447352e-06, |
| "loss": 0.4381, |
| "num_tokens": 6587473.0, |
| "step": 267 |
| }, |
| { |
| "epoch": 4.547008547008547, |
| "grad_norm": 1.9453125, |
| "learning_rate": 3.0972416106474946e-06, |
| "loss": 0.4589, |
| "num_tokens": 6611709.0, |
| "step": 268 |
| }, |
| { |
| "epoch": 4.564102564102564, |
| "grad_norm": 1.8203125, |
| "learning_rate": 3.0836134096397642e-06, |
| "loss": 0.4719, |
| "num_tokens": 6637580.0, |
| "step": 269 |
| }, |
| { |
| "epoch": 4.581196581196581, |
| "grad_norm": 1.9375, |
| "learning_rate": 3.0699668412265175e-06, |
| "loss": 0.4721, |
| "num_tokens": 6662574.0, |
| "step": 270 |
| }, |
| { |
| "epoch": 4.581196581196581, |
| "eval_loss": 0.5516491532325745, |
| "eval_num_tokens": 6662574.0, |
| "eval_runtime": 3.7149, |
| "eval_samples_per_second": 355.054, |
| "eval_steps_per_second": 11.306, |
| "step": 270 |
| }, |
| { |
| "epoch": 4.598290598290598, |
| "grad_norm": 1.8203125, |
| "learning_rate": 3.056302334890786e-06, |
| "loss": 0.4657, |
| "num_tokens": 6687298.0, |
| "step": 271 |
| }, |
| { |
| "epoch": 4.615384615384615, |
| "grad_norm": 1.8828125, |
| "learning_rate": 3.0426203206801407e-06, |
| "loss": 0.4967, |
| "num_tokens": 6712463.0, |
| "step": 272 |
| }, |
| { |
| "epoch": 4.632478632478632, |
| "grad_norm": 1.71875, |
| "learning_rate": 3.0289212291931576e-06, |
| "loss": 0.447, |
| "num_tokens": 6739418.0, |
| "step": 273 |
| }, |
| { |
| "epoch": 4.64957264957265, |
| "grad_norm": 1.7890625, |
| "learning_rate": 3.0152054915658664e-06, |
| "loss": 0.4805, |
| "num_tokens": 6765453.0, |
| "step": 274 |
| }, |
| { |
| "epoch": 4.666666666666667, |
| "grad_norm": 1.8203125, |
| "learning_rate": 3.0014735394581824e-06, |
| "loss": 0.4596, |
| "num_tokens": 6791101.0, |
| "step": 275 |
| }, |
| { |
| "epoch": 4.683760683760684, |
| "grad_norm": 1.875, |
| "learning_rate": 2.9877258050403214e-06, |
| "loss": 0.4765, |
| "num_tokens": 6815841.0, |
| "step": 276 |
| }, |
| { |
| "epoch": 4.700854700854701, |
| "grad_norm": 1.9453125, |
| "learning_rate": 2.9739627209791965e-06, |
| "loss": 0.4836, |
| "num_tokens": 6840193.0, |
| "step": 277 |
| }, |
| { |
| "epoch": 4.717948717948718, |
| "grad_norm": 1.96875, |
| "learning_rate": 2.9601847204248045e-06, |
| "loss": 0.4644, |
| "num_tokens": 6864869.0, |
| "step": 278 |
| }, |
| { |
| "epoch": 4.735042735042735, |
| "grad_norm": 1.9140625, |
| "learning_rate": 2.946392236996592e-06, |
| "loss": 0.4973, |
| "num_tokens": 6889912.0, |
| "step": 279 |
| }, |
| { |
| "epoch": 4.752136752136752, |
| "grad_norm": 1.875, |
| "learning_rate": 2.932585704769807e-06, |
| "loss": 0.464, |
| "num_tokens": 6913871.0, |
| "step": 280 |
| }, |
| { |
| "epoch": 4.769230769230769, |
| "grad_norm": 1.90625, |
| "learning_rate": 2.9187655582618413e-06, |
| "loss": 0.4396, |
| "num_tokens": 6938132.0, |
| "step": 281 |
| }, |
| { |
| "epoch": 4.786324786324786, |
| "grad_norm": 1.7890625, |
| "learning_rate": 2.9049322324185524e-06, |
| "loss": 0.4538, |
| "num_tokens": 6963411.0, |
| "step": 282 |
| }, |
| { |
| "epoch": 4.803418803418803, |
| "grad_norm": 1.7578125, |
| "learning_rate": 2.8910861626005774e-06, |
| "loss": 0.4609, |
| "num_tokens": 6988757.0, |
| "step": 283 |
| }, |
| { |
| "epoch": 4.82051282051282, |
| "grad_norm": 1.8046875, |
| "learning_rate": 2.877227784569629e-06, |
| "loss": 0.4293, |
| "num_tokens": 7013231.0, |
| "step": 284 |
| }, |
| { |
| "epoch": 4.837606837606837, |
| "grad_norm": 1.90625, |
| "learning_rate": 2.863357534474782e-06, |
| "loss": 0.4561, |
| "num_tokens": 7038366.0, |
| "step": 285 |
| }, |
| { |
| "epoch": 4.854700854700854, |
| "grad_norm": 1.7265625, |
| "learning_rate": 2.849475848838749e-06, |
| "loss": 0.4761, |
| "num_tokens": 7063893.0, |
| "step": 286 |
| }, |
| { |
| "epoch": 4.871794871794872, |
| "grad_norm": 1.8203125, |
| "learning_rate": 2.835583164544139e-06, |
| "loss": 0.4588, |
| "num_tokens": 7089629.0, |
| "step": 287 |
| }, |
| { |
| "epoch": 4.888888888888889, |
| "grad_norm": 1.8828125, |
| "learning_rate": 2.8216799188197096e-06, |
| "loss": 0.4962, |
| "num_tokens": 7114684.0, |
| "step": 288 |
| }, |
| { |
| "epoch": 4.905982905982906, |
| "grad_norm": 2.015625, |
| "learning_rate": 2.8077665492266077e-06, |
| "loss": 0.4569, |
| "num_tokens": 7138648.0, |
| "step": 289 |
| }, |
| { |
| "epoch": 4.923076923076923, |
| "grad_norm": 1.8046875, |
| "learning_rate": 2.7938434936445946e-06, |
| "loss": 0.4669, |
| "num_tokens": 7163715.0, |
| "step": 290 |
| }, |
| { |
| "epoch": 4.94017094017094, |
| "grad_norm": 1.9375, |
| "learning_rate": 2.7799111902582697e-06, |
| "loss": 0.4888, |
| "num_tokens": 7188583.0, |
| "step": 291 |
| }, |
| { |
| "epoch": 4.957264957264957, |
| "grad_norm": 1.84375, |
| "learning_rate": 2.7659700775432785e-06, |
| "loss": 0.4718, |
| "num_tokens": 7213949.0, |
| "step": 292 |
| }, |
| { |
| "epoch": 4.9743589743589745, |
| "grad_norm": 1.84375, |
| "learning_rate": 2.752020594252511e-06, |
| "loss": 0.4759, |
| "num_tokens": 7238071.0, |
| "step": 293 |
| }, |
| { |
| "epoch": 4.9914529914529915, |
| "grad_norm": 1.7890625, |
| "learning_rate": 2.738063179402297e-06, |
| "loss": 0.4706, |
| "num_tokens": 7263181.0, |
| "step": 294 |
| }, |
| { |
| "epoch": 5.0, |
| "grad_norm": 3.09375, |
| "learning_rate": 2.724098272258584e-06, |
| "loss": 0.4948, |
| "num_tokens": 7272305.0, |
| "step": 295 |
| }, |
| { |
| "epoch": 5.017094017094017, |
| "grad_norm": 1.78125, |
| "learning_rate": 2.710126312323119e-06, |
| "loss": 0.4636, |
| "num_tokens": 7296700.0, |
| "step": 296 |
| }, |
| { |
| "epoch": 5.034188034188034, |
| "grad_norm": 1.7578125, |
| "learning_rate": 2.696147739319613e-06, |
| "loss": 0.4625, |
| "num_tokens": 7322323.0, |
| "step": 297 |
| }, |
| { |
| "epoch": 5.051282051282051, |
| "grad_norm": 1.828125, |
| "learning_rate": 2.6821629931799007e-06, |
| "loss": 0.4597, |
| "num_tokens": 7346129.0, |
| "step": 298 |
| }, |
| { |
| "epoch": 5.068376068376068, |
| "grad_norm": 1.8984375, |
| "learning_rate": 2.6681725140300995e-06, |
| "loss": 0.4551, |
| "num_tokens": 7370889.0, |
| "step": 299 |
| }, |
| { |
| "epoch": 5.085470085470085, |
| "grad_norm": 1.8984375, |
| "learning_rate": 2.654176742176754e-06, |
| "loss": 0.4572, |
| "num_tokens": 7395404.0, |
| "step": 300 |
| }, |
| { |
| "epoch": 5.085470085470085, |
| "eval_loss": 0.5508914589881897, |
| "eval_num_tokens": 7395404.0, |
| "eval_runtime": 3.6111, |
| "eval_samples_per_second": 365.259, |
| "eval_steps_per_second": 11.631, |
| "step": 300 |
| }, |
| { |
| "epoch": 5.102564102564102, |
| "grad_norm": 1.734375, |
| "learning_rate": 2.6401761180929798e-06, |
| "loss": 0.4685, |
| "num_tokens": 7421156.0, |
| "step": 301 |
| }, |
| { |
| "epoch": 5.119658119658119, |
| "grad_norm": 1.90625, |
| "learning_rate": 2.626171082404602e-06, |
| "loss": 0.456, |
| "num_tokens": 7445126.0, |
| "step": 302 |
| }, |
| { |
| "epoch": 5.136752136752137, |
| "grad_norm": 1.8203125, |
| "learning_rate": 2.6121620758762877e-06, |
| "loss": 0.466, |
| "num_tokens": 7470250.0, |
| "step": 303 |
| }, |
| { |
| "epoch": 5.153846153846154, |
| "grad_norm": 1.84375, |
| "learning_rate": 2.5981495393976718e-06, |
| "loss": 0.493, |
| "num_tokens": 7495266.0, |
| "step": 304 |
| }, |
| { |
| "epoch": 5.170940170940171, |
| "grad_norm": 1.75, |
| "learning_rate": 2.5841339139694856e-06, |
| "loss": 0.4603, |
| "num_tokens": 7521185.0, |
| "step": 305 |
| }, |
| { |
| "epoch": 5.188034188034188, |
| "grad_norm": 1.765625, |
| "learning_rate": 2.5701156406896726e-06, |
| "loss": 0.4628, |
| "num_tokens": 7546972.0, |
| "step": 306 |
| }, |
| { |
| "epoch": 5.205128205128205, |
| "grad_norm": 1.875, |
| "learning_rate": 2.556095160739513e-06, |
| "loss": 0.4763, |
| "num_tokens": 7572102.0, |
| "step": 307 |
| }, |
| { |
| "epoch": 5.222222222222222, |
| "grad_norm": 1.9765625, |
| "learning_rate": 2.542072915369731e-06, |
| "loss": 0.483, |
| "num_tokens": 7595823.0, |
| "step": 308 |
| }, |
| { |
| "epoch": 5.239316239316239, |
| "grad_norm": 1.8046875, |
| "learning_rate": 2.528049345886615e-06, |
| "loss": 0.4536, |
| "num_tokens": 7621019.0, |
| "step": 309 |
| }, |
| { |
| "epoch": 5.256410256410256, |
| "grad_norm": 1.8125, |
| "learning_rate": 2.5140248936381245e-06, |
| "loss": 0.4774, |
| "num_tokens": 7645327.0, |
| "step": 310 |
| }, |
| { |
| "epoch": 5.273504273504273, |
| "grad_norm": 1.8359375, |
| "learning_rate": 2.5e-06, |
| "loss": 0.4717, |
| "num_tokens": 7670908.0, |
| "step": 311 |
| }, |
| { |
| "epoch": 5.2905982905982905, |
| "grad_norm": 1.90625, |
| "learning_rate": 2.4859751063618763e-06, |
| "loss": 0.4603, |
| "num_tokens": 7696041.0, |
| "step": 312 |
| }, |
| { |
| "epoch": 5.3076923076923075, |
| "grad_norm": 1.7890625, |
| "learning_rate": 2.4719506541133857e-06, |
| "loss": 0.4505, |
| "num_tokens": 7721184.0, |
| "step": 313 |
| }, |
| { |
| "epoch": 5.3247863247863245, |
| "grad_norm": 1.9921875, |
| "learning_rate": 2.45792708463027e-06, |
| "loss": 0.5045, |
| "num_tokens": 7746578.0, |
| "step": 314 |
| }, |
| { |
| "epoch": 5.3418803418803416, |
| "grad_norm": 1.8515625, |
| "learning_rate": 2.443904839260488e-06, |
| "loss": 0.483, |
| "num_tokens": 7772741.0, |
| "step": 315 |
| }, |
| { |
| "epoch": 5.358974358974359, |
| "grad_norm": 1.953125, |
| "learning_rate": 2.429884359310328e-06, |
| "loss": 0.4507, |
| "num_tokens": 7796809.0, |
| "step": 316 |
| }, |
| { |
| "epoch": 5.3760683760683765, |
| "grad_norm": 1.8515625, |
| "learning_rate": 2.415866086030516e-06, |
| "loss": 0.5084, |
| "num_tokens": 7823222.0, |
| "step": 317 |
| }, |
| { |
| "epoch": 5.3931623931623935, |
| "grad_norm": 1.9140625, |
| "learning_rate": 2.4018504606023295e-06, |
| "loss": 0.4656, |
| "num_tokens": 7847421.0, |
| "step": 318 |
| }, |
| { |
| "epoch": 5.410256410256411, |
| "grad_norm": 1.796875, |
| "learning_rate": 2.3878379241237136e-06, |
| "loss": 0.4679, |
| "num_tokens": 7872182.0, |
| "step": 319 |
| }, |
| { |
| "epoch": 5.427350427350428, |
| "grad_norm": 1.8046875, |
| "learning_rate": 2.373828917595398e-06, |
| "loss": 0.4579, |
| "num_tokens": 7897776.0, |
| "step": 320 |
| }, |
| { |
| "epoch": 5.444444444444445, |
| "grad_norm": 1.8046875, |
| "learning_rate": 2.3598238819070206e-06, |
| "loss": 0.4579, |
| "num_tokens": 7922850.0, |
| "step": 321 |
| }, |
| { |
| "epoch": 5.461538461538462, |
| "grad_norm": 1.8984375, |
| "learning_rate": 2.345823257823246e-06, |
| "loss": 0.4629, |
| "num_tokens": 7947751.0, |
| "step": 322 |
| }, |
| { |
| "epoch": 5.478632478632479, |
| "grad_norm": 1.8125, |
| "learning_rate": 2.331827485969901e-06, |
| "loss": 0.4474, |
| "num_tokens": 7971691.0, |
| "step": 323 |
| }, |
| { |
| "epoch": 5.495726495726496, |
| "grad_norm": 1.90625, |
| "learning_rate": 2.3178370068201e-06, |
| "loss": 0.4859, |
| "num_tokens": 7996117.0, |
| "step": 324 |
| }, |
| { |
| "epoch": 5.512820512820513, |
| "grad_norm": 1.84375, |
| "learning_rate": 2.3038522606803882e-06, |
| "loss": 0.504, |
| "num_tokens": 8022056.0, |
| "step": 325 |
| }, |
| { |
| "epoch": 5.52991452991453, |
| "grad_norm": 1.7734375, |
| "learning_rate": 2.2898736876768816e-06, |
| "loss": 0.4544, |
| "num_tokens": 8048659.0, |
| "step": 326 |
| }, |
| { |
| "epoch": 5.547008547008547, |
| "grad_norm": 1.8046875, |
| "learning_rate": 2.2759017277414165e-06, |
| "loss": 0.4736, |
| "num_tokens": 8072668.0, |
| "step": 327 |
| }, |
| { |
| "epoch": 5.564102564102564, |
| "grad_norm": 1.84375, |
| "learning_rate": 2.2619368205977038e-06, |
| "loss": 0.4115, |
| "num_tokens": 8096796.0, |
| "step": 328 |
| }, |
| { |
| "epoch": 5.581196581196581, |
| "grad_norm": 1.7890625, |
| "learning_rate": 2.2479794057474893e-06, |
| "loss": 0.4616, |
| "num_tokens": 8121830.0, |
| "step": 329 |
| }, |
| { |
| "epoch": 5.598290598290598, |
| "grad_norm": 1.9609375, |
| "learning_rate": 2.234029922456722e-06, |
| "loss": 0.4777, |
| "num_tokens": 8145954.0, |
| "step": 330 |
| }, |
| { |
| "epoch": 5.598290598290598, |
| "eval_loss": 0.5508424043655396, |
| "eval_num_tokens": 8145954.0, |
| "eval_runtime": 3.7263, |
| "eval_samples_per_second": 353.969, |
| "eval_steps_per_second": 11.271, |
| "step": 330 |
| }, |
| { |
| "epoch": 5.615384615384615, |
| "grad_norm": 1.875, |
| "learning_rate": 2.2200888097417308e-06, |
| "loss": 0.4598, |
| "num_tokens": 8169934.0, |
| "step": 331 |
| }, |
| { |
| "epoch": 5.632478632478632, |
| "grad_norm": 1.84375, |
| "learning_rate": 2.2061565063554063e-06, |
| "loss": 0.4541, |
| "num_tokens": 8194695.0, |
| "step": 332 |
| }, |
| { |
| "epoch": 5.64957264957265, |
| "grad_norm": 1.7890625, |
| "learning_rate": 2.192233450773393e-06, |
| "loss": 0.5074, |
| "num_tokens": 8220946.0, |
| "step": 333 |
| }, |
| { |
| "epoch": 5.666666666666667, |
| "grad_norm": 1.8828125, |
| "learning_rate": 2.178320081180291e-06, |
| "loss": 0.4731, |
| "num_tokens": 8245010.0, |
| "step": 334 |
| }, |
| { |
| "epoch": 5.683760683760684, |
| "grad_norm": 1.8359375, |
| "learning_rate": 2.1644168354558623e-06, |
| "loss": 0.48, |
| "num_tokens": 8269142.0, |
| "step": 335 |
| }, |
| { |
| "epoch": 5.700854700854701, |
| "grad_norm": 1.8671875, |
| "learning_rate": 2.1505241511612522e-06, |
| "loss": 0.4469, |
| "num_tokens": 8293617.0, |
| "step": 336 |
| }, |
| { |
| "epoch": 5.717948717948718, |
| "grad_norm": 1.9609375, |
| "learning_rate": 2.136642465525219e-06, |
| "loss": 0.4769, |
| "num_tokens": 8317882.0, |
| "step": 337 |
| }, |
| { |
| "epoch": 5.735042735042735, |
| "grad_norm": 1.796875, |
| "learning_rate": 2.1227722154303714e-06, |
| "loss": 0.4728, |
| "num_tokens": 8342847.0, |
| "step": 338 |
| }, |
| { |
| "epoch": 5.752136752136752, |
| "grad_norm": 1.796875, |
| "learning_rate": 2.1089138373994226e-06, |
| "loss": 0.4697, |
| "num_tokens": 8367548.0, |
| "step": 339 |
| }, |
| { |
| "epoch": 5.769230769230769, |
| "grad_norm": 1.875, |
| "learning_rate": 2.095067767581447e-06, |
| "loss": 0.4634, |
| "num_tokens": 8392538.0, |
| "step": 340 |
| }, |
| { |
| "epoch": 5.786324786324786, |
| "grad_norm": 1.8203125, |
| "learning_rate": 2.0812344417381595e-06, |
| "loss": 0.4732, |
| "num_tokens": 8417232.0, |
| "step": 341 |
| }, |
| { |
| "epoch": 5.803418803418803, |
| "grad_norm": 1.796875, |
| "learning_rate": 2.0674142952301932e-06, |
| "loss": 0.4754, |
| "num_tokens": 8442838.0, |
| "step": 342 |
| }, |
| { |
| "epoch": 5.82051282051282, |
| "grad_norm": 1.6875, |
| "learning_rate": 2.053607763003409e-06, |
| "loss": 0.4584, |
| "num_tokens": 8468492.0, |
| "step": 343 |
| }, |
| { |
| "epoch": 5.837606837606837, |
| "grad_norm": 1.8125, |
| "learning_rate": 2.039815279575196e-06, |
| "loss": 0.4924, |
| "num_tokens": 8494007.0, |
| "step": 344 |
| }, |
| { |
| "epoch": 5.854700854700854, |
| "grad_norm": 1.875, |
| "learning_rate": 2.026037279020804e-06, |
| "loss": 0.468, |
| "num_tokens": 8519316.0, |
| "step": 345 |
| }, |
| { |
| "epoch": 5.871794871794872, |
| "grad_norm": 1.9375, |
| "learning_rate": 2.01227419495968e-06, |
| "loss": 0.4625, |
| "num_tokens": 8543710.0, |
| "step": 346 |
| }, |
| { |
| "epoch": 5.888888888888889, |
| "grad_norm": 1.8828125, |
| "learning_rate": 1.9985264605418185e-06, |
| "loss": 0.465, |
| "num_tokens": 8568173.0, |
| "step": 347 |
| }, |
| { |
| "epoch": 5.905982905982906, |
| "grad_norm": 1.921875, |
| "learning_rate": 1.9847945084341345e-06, |
| "loss": 0.4927, |
| "num_tokens": 8593427.0, |
| "step": 348 |
| }, |
| { |
| "epoch": 5.923076923076923, |
| "grad_norm": 1.7890625, |
| "learning_rate": 1.9710787708068433e-06, |
| "loss": 0.4748, |
| "num_tokens": 8618045.0, |
| "step": 349 |
| }, |
| { |
| "epoch": 5.94017094017094, |
| "grad_norm": 1.8203125, |
| "learning_rate": 1.9573796793198597e-06, |
| "loss": 0.4403, |
| "num_tokens": 8643396.0, |
| "step": 350 |
| }, |
| { |
| "epoch": 5.957264957264957, |
| "grad_norm": 1.78125, |
| "learning_rate": 1.9436976651092143e-06, |
| "loss": 0.4478, |
| "num_tokens": 8667537.0, |
| "step": 351 |
| }, |
| { |
| "epoch": 5.9743589743589745, |
| "grad_norm": 1.8828125, |
| "learning_rate": 1.9300331587734838e-06, |
| "loss": 0.4519, |
| "num_tokens": 8691813.0, |
| "step": 352 |
| }, |
| { |
| "epoch": 5.9914529914529915, |
| "grad_norm": 1.9296875, |
| "learning_rate": 1.9163865903602374e-06, |
| "loss": 0.4919, |
| "num_tokens": 8717266.0, |
| "step": 353 |
| }, |
| { |
| "epoch": 6.0, |
| "grad_norm": 3.203125, |
| "learning_rate": 1.9027583893525067e-06, |
| "loss": 0.4946, |
| "num_tokens": 8726766.0, |
| "step": 354 |
| }, |
| { |
| "epoch": 6.017094017094017, |
| "grad_norm": 1.7265625, |
| "learning_rate": 1.8891489846552645e-06, |
| "loss": 0.4804, |
| "num_tokens": 8751647.0, |
| "step": 355 |
| }, |
| { |
| "epoch": 6.034188034188034, |
| "grad_norm": 1.8671875, |
| "learning_rate": 1.8755588045819325e-06, |
| "loss": 0.456, |
| "num_tokens": 8776752.0, |
| "step": 356 |
| }, |
| { |
| "epoch": 6.051282051282051, |
| "grad_norm": 1.953125, |
| "learning_rate": 1.8619882768408936e-06, |
| "loss": 0.4773, |
| "num_tokens": 8801564.0, |
| "step": 357 |
| }, |
| { |
| "epoch": 6.068376068376068, |
| "grad_norm": 1.8125, |
| "learning_rate": 1.8484378285220367e-06, |
| "loss": 0.4717, |
| "num_tokens": 8826820.0, |
| "step": 358 |
| }, |
| { |
| "epoch": 6.085470085470085, |
| "grad_norm": 1.796875, |
| "learning_rate": 1.8349078860833125e-06, |
| "loss": 0.4486, |
| "num_tokens": 8852240.0, |
| "step": 359 |
| }, |
| { |
| "epoch": 6.102564102564102, |
| "grad_norm": 1.875, |
| "learning_rate": 1.8213988753373147e-06, |
| "loss": 0.4355, |
| "num_tokens": 8876431.0, |
| "step": 360 |
| }, |
| { |
| "epoch": 6.102564102564102, |
| "eval_loss": 0.5509193539619446, |
| "eval_num_tokens": 8876431.0, |
| "eval_runtime": 3.8061, |
| "eval_samples_per_second": 346.55, |
| "eval_steps_per_second": 11.035, |
| "step": 360 |
| }, |
| { |
| "epoch": 6.119658119658119, |
| "grad_norm": 1.8515625, |
| "learning_rate": 1.8079112214378769e-06, |
| "loss": 0.4951, |
| "num_tokens": 8902069.0, |
| "step": 361 |
| }, |
| { |
| "epoch": 6.136752136752137, |
| "grad_norm": 1.8046875, |
| "learning_rate": 1.7944453488666929e-06, |
| "loss": 0.4818, |
| "num_tokens": 8927782.0, |
| "step": 362 |
| }, |
| { |
| "epoch": 6.153846153846154, |
| "grad_norm": 1.7734375, |
| "learning_rate": 1.781001681419957e-06, |
| "loss": 0.4467, |
| "num_tokens": 8953475.0, |
| "step": 363 |
| }, |
| { |
| "epoch": 6.170940170940171, |
| "grad_norm": 1.796875, |
| "learning_rate": 1.7675806421950278e-06, |
| "loss": 0.4827, |
| "num_tokens": 8978643.0, |
| "step": 364 |
| }, |
| { |
| "epoch": 6.188034188034188, |
| "grad_norm": 1.8125, |
| "learning_rate": 1.75418265357711e-06, |
| "loss": 0.4599, |
| "num_tokens": 9004133.0, |
| "step": 365 |
| }, |
| { |
| "epoch": 6.205128205128205, |
| "grad_norm": 1.890625, |
| "learning_rate": 1.7408081372259633e-06, |
| "loss": 0.4838, |
| "num_tokens": 9029609.0, |
| "step": 366 |
| }, |
| { |
| "epoch": 6.222222222222222, |
| "grad_norm": 1.9765625, |
| "learning_rate": 1.7274575140626318e-06, |
| "loss": 0.469, |
| "num_tokens": 9053446.0, |
| "step": 367 |
| }, |
| { |
| "epoch": 6.239316239316239, |
| "grad_norm": 1.84375, |
| "learning_rate": 1.714131204256195e-06, |
| "loss": 0.4354, |
| "num_tokens": 9078936.0, |
| "step": 368 |
| }, |
| { |
| "epoch": 6.256410256410256, |
| "grad_norm": 1.859375, |
| "learning_rate": 1.7008296272105469e-06, |
| "loss": 0.4997, |
| "num_tokens": 9102790.0, |
| "step": 369 |
| }, |
| { |
| "epoch": 6.273504273504273, |
| "grad_norm": 2.015625, |
| "learning_rate": 1.6875532015511945e-06, |
| "loss": 0.4745, |
| "num_tokens": 9127911.0, |
| "step": 370 |
| }, |
| { |
| "epoch": 6.2905982905982905, |
| "grad_norm": 1.9140625, |
| "learning_rate": 1.6743023451120831e-06, |
| "loss": 0.4707, |
| "num_tokens": 9153311.0, |
| "step": 371 |
| }, |
| { |
| "epoch": 6.3076923076923075, |
| "grad_norm": 1.859375, |
| "learning_rate": 1.6610774749224484e-06, |
| "loss": 0.4863, |
| "num_tokens": 9177952.0, |
| "step": 372 |
| }, |
| { |
| "epoch": 6.3247863247863245, |
| "grad_norm": 1.8203125, |
| "learning_rate": 1.6478790071936875e-06, |
| "loss": 0.4565, |
| "num_tokens": 9203024.0, |
| "step": 373 |
| }, |
| { |
| "epoch": 6.3418803418803416, |
| "grad_norm": 1.90625, |
| "learning_rate": 1.634707357306267e-06, |
| "loss": 0.5029, |
| "num_tokens": 9227504.0, |
| "step": 374 |
| }, |
| { |
| "epoch": 6.358974358974359, |
| "grad_norm": 1.84375, |
| "learning_rate": 1.6215629397966432e-06, |
| "loss": 0.4841, |
| "num_tokens": 9252329.0, |
| "step": 375 |
| }, |
| { |
| "epoch": 6.3760683760683765, |
| "grad_norm": 1.828125, |
| "learning_rate": 1.6084461683442176e-06, |
| "loss": 0.485, |
| "num_tokens": 9278535.0, |
| "step": 376 |
| }, |
| { |
| "epoch": 6.3931623931623935, |
| "grad_norm": 1.9375, |
| "learning_rate": 1.5953574557583202e-06, |
| "loss": 0.4622, |
| "num_tokens": 9301964.0, |
| "step": 377 |
| }, |
| { |
| "epoch": 6.410256410256411, |
| "grad_norm": 1.8359375, |
| "learning_rate": 1.5822972139652148e-06, |
| "loss": 0.4401, |
| "num_tokens": 9326744.0, |
| "step": 378 |
| }, |
| { |
| "epoch": 6.427350427350428, |
| "grad_norm": 1.8984375, |
| "learning_rate": 1.5692658539951371e-06, |
| "loss": 0.4447, |
| "num_tokens": 9351391.0, |
| "step": 379 |
| }, |
| { |
| "epoch": 6.444444444444445, |
| "grad_norm": 1.890625, |
| "learning_rate": 1.5562637859693585e-06, |
| "loss": 0.4805, |
| "num_tokens": 9375727.0, |
| "step": 380 |
| }, |
| { |
| "epoch": 6.461538461538462, |
| "grad_norm": 1.9140625, |
| "learning_rate": 1.5432914190872757e-06, |
| "loss": 0.4464, |
| "num_tokens": 9399774.0, |
| "step": 381 |
| }, |
| { |
| "epoch": 6.478632478632479, |
| "grad_norm": 1.953125, |
| "learning_rate": 1.5303491616135374e-06, |
| "loss": 0.4372, |
| "num_tokens": 9424390.0, |
| "step": 382 |
| }, |
| { |
| "epoch": 6.495726495726496, |
| "grad_norm": 1.828125, |
| "learning_rate": 1.5174374208651913e-06, |
| "loss": 0.4291, |
| "num_tokens": 9448880.0, |
| "step": 383 |
| }, |
| { |
| "epoch": 6.512820512820513, |
| "grad_norm": 1.8359375, |
| "learning_rate": 1.5045566031988672e-06, |
| "loss": 0.4669, |
| "num_tokens": 9473258.0, |
| "step": 384 |
| }, |
| { |
| "epoch": 6.52991452991453, |
| "grad_norm": 1.828125, |
| "learning_rate": 1.4917071139979877e-06, |
| "loss": 0.5007, |
| "num_tokens": 9497996.0, |
| "step": 385 |
| }, |
| { |
| "epoch": 6.547008547008547, |
| "grad_norm": 1.859375, |
| "learning_rate": 1.47888935766001e-06, |
| "loss": 0.4926, |
| "num_tokens": 9523600.0, |
| "step": 386 |
| }, |
| { |
| "epoch": 6.564102564102564, |
| "grad_norm": 1.7578125, |
| "learning_rate": 1.466103737583699e-06, |
| "loss": 0.4558, |
| "num_tokens": 9548721.0, |
| "step": 387 |
| }, |
| { |
| "epoch": 6.581196581196581, |
| "grad_norm": 1.7734375, |
| "learning_rate": 1.4533506561564305e-06, |
| "loss": 0.4606, |
| "num_tokens": 9573873.0, |
| "step": 388 |
| }, |
| { |
| "epoch": 6.598290598290598, |
| "grad_norm": 1.7890625, |
| "learning_rate": 1.4406305147415284e-06, |
| "loss": 0.4623, |
| "num_tokens": 9598689.0, |
| "step": 389 |
| }, |
| { |
| "epoch": 6.615384615384615, |
| "grad_norm": 1.8515625, |
| "learning_rate": 1.4279437136656338e-06, |
| "loss": 0.4886, |
| "num_tokens": 9624643.0, |
| "step": 390 |
| }, |
| { |
| "epoch": 6.615384615384615, |
| "eval_loss": 0.5507991313934326, |
| "eval_num_tokens": 9624643.0, |
| "eval_runtime": 3.8292, |
| "eval_samples_per_second": 344.459, |
| "eval_steps_per_second": 10.968, |
| "step": 390 |
| }, |
| { |
| "epoch": 6.632478632478632, |
| "grad_norm": 1.8203125, |
| "learning_rate": 1.415290652206105e-06, |
| "loss": 0.4754, |
| "num_tokens": 9650106.0, |
| "step": 391 |
| }, |
| { |
| "epoch": 6.64957264957265, |
| "grad_norm": 1.890625, |
| "learning_rate": 1.402671728578449e-06, |
| "loss": 0.4861, |
| "num_tokens": 9674783.0, |
| "step": 392 |
| }, |
| { |
| "epoch": 6.666666666666667, |
| "grad_norm": 1.8125, |
| "learning_rate": 1.3900873399237953e-06, |
| "loss": 0.4472, |
| "num_tokens": 9699648.0, |
| "step": 393 |
| }, |
| { |
| "epoch": 6.683760683760684, |
| "grad_norm": 1.921875, |
| "learning_rate": 1.3775378822963884e-06, |
| "loss": 0.4676, |
| "num_tokens": 9723820.0, |
| "step": 394 |
| }, |
| { |
| "epoch": 6.700854700854701, |
| "grad_norm": 1.921875, |
| "learning_rate": 1.3650237506511333e-06, |
| "loss": 0.4269, |
| "num_tokens": 9747820.0, |
| "step": 395 |
| }, |
| { |
| "epoch": 6.717948717948718, |
| "grad_norm": 1.875, |
| "learning_rate": 1.3525453388311554e-06, |
| "loss": 0.4732, |
| "num_tokens": 9773075.0, |
| "step": 396 |
| }, |
| { |
| "epoch": 6.735042735042735, |
| "grad_norm": 1.8671875, |
| "learning_rate": 1.3401030395554152e-06, |
| "loss": 0.4903, |
| "num_tokens": 9797797.0, |
| "step": 397 |
| }, |
| { |
| "epoch": 6.752136752136752, |
| "grad_norm": 1.8359375, |
| "learning_rate": 1.3276972444063386e-06, |
| "loss": 0.4503, |
| "num_tokens": 9822635.0, |
| "step": 398 |
| }, |
| { |
| "epoch": 6.769230769230769, |
| "grad_norm": 1.8828125, |
| "learning_rate": 1.3153283438175036e-06, |
| "loss": 0.4821, |
| "num_tokens": 9847708.0, |
| "step": 399 |
| }, |
| { |
| "epoch": 6.786324786324786, |
| "grad_norm": 1.8203125, |
| "learning_rate": 1.3029967270613435e-06, |
| "loss": 0.4617, |
| "num_tokens": 9871866.0, |
| "step": 400 |
| }, |
| { |
| "epoch": 6.803418803418803, |
| "grad_norm": 1.8671875, |
| "learning_rate": 1.2907027822369006e-06, |
| "loss": 0.4251, |
| "num_tokens": 9895958.0, |
| "step": 401 |
| }, |
| { |
| "epoch": 6.82051282051282, |
| "grad_norm": 1.78125, |
| "learning_rate": 1.2784468962576136e-06, |
| "loss": 0.4433, |
| "num_tokens": 9920177.0, |
| "step": 402 |
| }, |
| { |
| "epoch": 6.837606837606837, |
| "grad_norm": 1.734375, |
| "learning_rate": 1.266229454839133e-06, |
| "loss": 0.4532, |
| "num_tokens": 9945727.0, |
| "step": 403 |
| }, |
| { |
| "epoch": 6.854700854700854, |
| "grad_norm": 1.828125, |
| "learning_rate": 1.2540508424871934e-06, |
| "loss": 0.4492, |
| "num_tokens": 9970657.0, |
| "step": 404 |
| }, |
| { |
| "epoch": 6.871794871794872, |
| "grad_norm": 1.9296875, |
| "learning_rate": 1.2419114424855e-06, |
| "loss": 0.492, |
| "num_tokens": 9995805.0, |
| "step": 405 |
| }, |
| { |
| "epoch": 6.888888888888889, |
| "grad_norm": 1.828125, |
| "learning_rate": 1.2298116368836772e-06, |
| "loss": 0.4824, |
| "num_tokens": 10020922.0, |
| "step": 406 |
| }, |
| { |
| "epoch": 6.905982905982906, |
| "grad_norm": 1.8046875, |
| "learning_rate": 1.217751806485235e-06, |
| "loss": 0.4254, |
| "num_tokens": 10046307.0, |
| "step": 407 |
| }, |
| { |
| "epoch": 6.923076923076923, |
| "grad_norm": 1.875, |
| "learning_rate": 1.2057323308355922e-06, |
| "loss": 0.4682, |
| "num_tokens": 10071470.0, |
| "step": 408 |
| }, |
| { |
| "epoch": 6.94017094017094, |
| "grad_norm": 1.828125, |
| "learning_rate": 1.193753588210128e-06, |
| "loss": 0.4738, |
| "num_tokens": 10096698.0, |
| "step": 409 |
| }, |
| { |
| "epoch": 6.957264957264957, |
| "grad_norm": 1.8515625, |
| "learning_rate": 1.1818159556022748e-06, |
| "loss": 0.4758, |
| "num_tokens": 10122020.0, |
| "step": 410 |
| }, |
| { |
| "epoch": 6.9743589743589745, |
| "grad_norm": 1.7890625, |
| "learning_rate": 1.169919808711659e-06, |
| "loss": 0.4722, |
| "num_tokens": 10146447.0, |
| "step": 411 |
| }, |
| { |
| "epoch": 6.9914529914529915, |
| "grad_norm": 1.90625, |
| "learning_rate": 1.15806552193227e-06, |
| "loss": 0.4545, |
| "num_tokens": 10171539.0, |
| "step": 412 |
| }, |
| { |
| "epoch": 7.0, |
| "grad_norm": 3.140625, |
| "learning_rate": 1.1462534683406859e-06, |
| "loss": 0.5255, |
| "num_tokens": 10181227.0, |
| "step": 413 |
| }, |
| { |
| "epoch": 7.017094017094017, |
| "grad_norm": 1.8359375, |
| "learning_rate": 1.1344840196843228e-06, |
| "loss": 0.4529, |
| "num_tokens": 10206262.0, |
| "step": 414 |
| }, |
| { |
| "epoch": 7.034188034188034, |
| "grad_norm": 1.84375, |
| "learning_rate": 1.122757546369744e-06, |
| "loss": 0.4641, |
| "num_tokens": 10230139.0, |
| "step": 415 |
| }, |
| { |
| "epoch": 7.051282051282051, |
| "grad_norm": 1.9140625, |
| "learning_rate": 1.1110744174509952e-06, |
| "loss": 0.4764, |
| "num_tokens": 10255012.0, |
| "step": 416 |
| }, |
| { |
| "epoch": 7.068376068376068, |
| "grad_norm": 1.8203125, |
| "learning_rate": 1.0994350006179933e-06, |
| "loss": 0.4632, |
| "num_tokens": 10280931.0, |
| "step": 417 |
| }, |
| { |
| "epoch": 7.085470085470085, |
| "grad_norm": 1.890625, |
| "learning_rate": 1.0878396621849565e-06, |
| "loss": 0.4694, |
| "num_tokens": 10306239.0, |
| "step": 418 |
| }, |
| { |
| "epoch": 7.102564102564102, |
| "grad_norm": 1.7890625, |
| "learning_rate": 1.0762887670788702e-06, |
| "loss": 0.4779, |
| "num_tokens": 10331372.0, |
| "step": 419 |
| }, |
| { |
| "epoch": 7.119658119658119, |
| "grad_norm": 1.875, |
| "learning_rate": 1.0647826788280084e-06, |
| "loss": 0.4851, |
| "num_tokens": 10356892.0, |
| "step": 420 |
| }, |
| { |
| "epoch": 7.119658119658119, |
| "eval_loss": 0.5506927967071533, |
| "eval_num_tokens": 10356892.0, |
| "eval_runtime": 3.8256, |
| "eval_samples_per_second": 344.787, |
| "eval_steps_per_second": 10.979, |
| "step": 420 |
| }, |
| { |
| "epoch": 7.136752136752137, |
| "grad_norm": 1.8515625, |
| "learning_rate": 1.0533217595504859e-06, |
| "loss": 0.4674, |
| "num_tokens": 10381394.0, |
| "step": 421 |
| }, |
| { |
| "epoch": 7.153846153846154, |
| "grad_norm": 1.890625, |
| "learning_rate": 1.041906369942869e-06, |
| "loss": 0.4644, |
| "num_tokens": 10405722.0, |
| "step": 422 |
| }, |
| { |
| "epoch": 7.170940170940171, |
| "grad_norm": 2.015625, |
| "learning_rate": 1.0305368692688175e-06, |
| "loss": 0.4791, |
| "num_tokens": 10430615.0, |
| "step": 423 |
| }, |
| { |
| "epoch": 7.188034188034188, |
| "grad_norm": 1.8359375, |
| "learning_rate": 1.0192136153477825e-06, |
| "loss": 0.4578, |
| "num_tokens": 10455551.0, |
| "step": 424 |
| }, |
| { |
| "epoch": 7.205128205128205, |
| "grad_norm": 1.859375, |
| "learning_rate": 1.0079369645437411e-06, |
| "loss": 0.4761, |
| "num_tokens": 10480682.0, |
| "step": 425 |
| }, |
| { |
| "epoch": 7.222222222222222, |
| "grad_norm": 1.828125, |
| "learning_rate": 9.967072717539852e-07, |
| "loss": 0.481, |
| "num_tokens": 10505454.0, |
| "step": 426 |
| }, |
| { |
| "epoch": 7.239316239316239, |
| "grad_norm": 1.9296875, |
| "learning_rate": 9.855248903979505e-07, |
| "loss": 0.4992, |
| "num_tokens": 10529967.0, |
| "step": 427 |
| }, |
| { |
| "epoch": 7.256410256410256, |
| "grad_norm": 1.9453125, |
| "learning_rate": 9.743901724060905e-07, |
| "loss": 0.5156, |
| "num_tokens": 10555112.0, |
| "step": 428 |
| }, |
| { |
| "epoch": 7.273504273504273, |
| "grad_norm": 1.9375, |
| "learning_rate": 9.633034682088072e-07, |
| "loss": 0.4761, |
| "num_tokens": 10580728.0, |
| "step": 429 |
| }, |
| { |
| "epoch": 7.2905982905982905, |
| "grad_norm": 1.890625, |
| "learning_rate": 9.522651267254149e-07, |
| "loss": 0.448, |
| "num_tokens": 10605792.0, |
| "step": 430 |
| }, |
| { |
| "epoch": 7.3076923076923075, |
| "grad_norm": 1.828125, |
| "learning_rate": 9.412754953531664e-07, |
| "loss": 0.463, |
| "num_tokens": 10630947.0, |
| "step": 431 |
| }, |
| { |
| "epoch": 7.3247863247863245, |
| "grad_norm": 1.7265625, |
| "learning_rate": 9.303349199563131e-07, |
| "loss": 0.4274, |
| "num_tokens": 10656582.0, |
| "step": 432 |
| }, |
| { |
| "epoch": 7.3418803418803416, |
| "grad_norm": 1.7265625, |
| "learning_rate": 9.19443744855226e-07, |
| "loss": 0.4583, |
| "num_tokens": 10682836.0, |
| "step": 433 |
| }, |
| { |
| "epoch": 7.358974358974359, |
| "grad_norm": 1.890625, |
| "learning_rate": 9.086023128155543e-07, |
| "loss": 0.5, |
| "num_tokens": 10707868.0, |
| "step": 434 |
| }, |
| { |
| "epoch": 7.3760683760683765, |
| "grad_norm": 1.7265625, |
| "learning_rate": 8.978109650374398e-07, |
| "loss": 0.4611, |
| "num_tokens": 10733254.0, |
| "step": 435 |
| }, |
| { |
| "epoch": 7.3931623931623935, |
| "grad_norm": 1.7890625, |
| "learning_rate": 8.870700411447817e-07, |
| "loss": 0.4758, |
| "num_tokens": 10757749.0, |
| "step": 436 |
| }, |
| { |
| "epoch": 7.410256410256411, |
| "grad_norm": 1.8828125, |
| "learning_rate": 8.763798791745413e-07, |
| "loss": 0.4334, |
| "num_tokens": 10782950.0, |
| "step": 437 |
| }, |
| { |
| "epoch": 7.427350427350428, |
| "grad_norm": 1.84375, |
| "learning_rate": 8.657408155661109e-07, |
| "loss": 0.4593, |
| "num_tokens": 10806903.0, |
| "step": 438 |
| }, |
| { |
| "epoch": 7.444444444444445, |
| "grad_norm": 1.734375, |
| "learning_rate": 8.551531851507186e-07, |
| "loss": 0.4515, |
| "num_tokens": 10831694.0, |
| "step": 439 |
| }, |
| { |
| "epoch": 7.461538461538462, |
| "grad_norm": 1.8671875, |
| "learning_rate": 8.446173211408972e-07, |
| "loss": 0.4773, |
| "num_tokens": 10856829.0, |
| "step": 440 |
| }, |
| { |
| "epoch": 7.478632478632479, |
| "grad_norm": 1.875, |
| "learning_rate": 8.341335551199903e-07, |
| "loss": 0.4766, |
| "num_tokens": 10880998.0, |
| "step": 441 |
| }, |
| { |
| "epoch": 7.495726495726496, |
| "grad_norm": 1.8828125, |
| "learning_rate": 8.237022170317235e-07, |
| "loss": 0.4733, |
| "num_tokens": 10906303.0, |
| "step": 442 |
| }, |
| { |
| "epoch": 7.512820512820513, |
| "grad_norm": 1.796875, |
| "learning_rate": 8.133236351698143e-07, |
| "loss": 0.4881, |
| "num_tokens": 10930727.0, |
| "step": 443 |
| }, |
| { |
| "epoch": 7.52991452991453, |
| "grad_norm": 1.84375, |
| "learning_rate": 8.029981361676456e-07, |
| "loss": 0.4623, |
| "num_tokens": 10955710.0, |
| "step": 444 |
| }, |
| { |
| "epoch": 7.547008547008547, |
| "grad_norm": 1.8046875, |
| "learning_rate": 7.927260449879828e-07, |
| "loss": 0.4491, |
| "num_tokens": 10980669.0, |
| "step": 445 |
| }, |
| { |
| "epoch": 7.564102564102564, |
| "grad_norm": 1.8984375, |
| "learning_rate": 7.825076849127458e-07, |
| "loss": 0.4894, |
| "num_tokens": 11004817.0, |
| "step": 446 |
| }, |
| { |
| "epoch": 7.581196581196581, |
| "grad_norm": 1.921875, |
| "learning_rate": 7.723433775328385e-07, |
| "loss": 0.4662, |
| "num_tokens": 11029057.0, |
| "step": 447 |
| }, |
| { |
| "epoch": 7.598290598290598, |
| "grad_norm": 1.8359375, |
| "learning_rate": 7.622334427380229e-07, |
| "loss": 0.4717, |
| "num_tokens": 11054362.0, |
| "step": 448 |
| }, |
| { |
| "epoch": 7.615384615384615, |
| "grad_norm": 1.7734375, |
| "learning_rate": 7.521781987068566e-07, |
| "loss": 0.4596, |
| "num_tokens": 11079670.0, |
| "step": 449 |
| }, |
| { |
| "epoch": 7.632478632478632, |
| "grad_norm": 1.9375, |
| "learning_rate": 7.421779618966737e-07, |
| "loss": 0.4315, |
| "num_tokens": 11104223.0, |
| "step": 450 |
| }, |
| { |
| "epoch": 7.632478632478632, |
| "eval_loss": 0.5506695508956909, |
| "eval_num_tokens": 11104223.0, |
| "eval_runtime": 3.646, |
| "eval_samples_per_second": 361.764, |
| "eval_steps_per_second": 11.519, |
| "step": 450 |
| }, |
| { |
| "epoch": 7.64957264957265, |
| "grad_norm": 1.8671875, |
| "learning_rate": 7.322330470336314e-07, |
| "loss": 0.4418, |
| "num_tokens": 11129525.0, |
| "step": 451 |
| }, |
| { |
| "epoch": 7.666666666666667, |
| "grad_norm": 1.8671875, |
| "learning_rate": 7.223437671027994e-07, |
| "loss": 0.4092, |
| "num_tokens": 11153830.0, |
| "step": 452 |
| }, |
| { |
| "epoch": 7.683760683760684, |
| "grad_norm": 1.78125, |
| "learning_rate": 7.125104333383117e-07, |
| "loss": 0.4523, |
| "num_tokens": 11178840.0, |
| "step": 453 |
| }, |
| { |
| "epoch": 7.700854700854701, |
| "grad_norm": 1.84375, |
| "learning_rate": 7.027333552135748e-07, |
| "loss": 0.4741, |
| "num_tokens": 11203539.0, |
| "step": 454 |
| }, |
| { |
| "epoch": 7.717948717948718, |
| "grad_norm": 1.703125, |
| "learning_rate": 6.930128404315214e-07, |
| "loss": 0.4736, |
| "num_tokens": 11229499.0, |
| "step": 455 |
| }, |
| { |
| "epoch": 7.735042735042735, |
| "grad_norm": 1.8671875, |
| "learning_rate": 6.833491949149329e-07, |
| "loss": 0.4764, |
| "num_tokens": 11254429.0, |
| "step": 456 |
| }, |
| { |
| "epoch": 7.752136752136752, |
| "grad_norm": 1.9375, |
| "learning_rate": 6.737427227968063e-07, |
| "loss": 0.4781, |
| "num_tokens": 11279074.0, |
| "step": 457 |
| }, |
| { |
| "epoch": 7.769230769230769, |
| "grad_norm": 1.828125, |
| "learning_rate": 6.641937264107868e-07, |
| "loss": 0.4477, |
| "num_tokens": 11303890.0, |
| "step": 458 |
| }, |
| { |
| "epoch": 7.786324786324786, |
| "grad_norm": 1.859375, |
| "learning_rate": 6.547025062816487e-07, |
| "loss": 0.4673, |
| "num_tokens": 11328933.0, |
| "step": 459 |
| }, |
| { |
| "epoch": 7.803418803418803, |
| "grad_norm": 1.875, |
| "learning_rate": 6.452693611158412e-07, |
| "loss": 0.5019, |
| "num_tokens": 11354261.0, |
| "step": 460 |
| }, |
| { |
| "epoch": 7.82051282051282, |
| "grad_norm": 1.8046875, |
| "learning_rate": 6.358945877920861e-07, |
| "loss": 0.4578, |
| "num_tokens": 11379618.0, |
| "step": 461 |
| }, |
| { |
| "epoch": 7.837606837606837, |
| "grad_norm": 1.7421875, |
| "learning_rate": 6.265784813520318e-07, |
| "loss": 0.4628, |
| "num_tokens": 11405767.0, |
| "step": 462 |
| }, |
| { |
| "epoch": 7.854700854700854, |
| "grad_norm": 1.890625, |
| "learning_rate": 6.17321334990973e-07, |
| "loss": 0.4494, |
| "num_tokens": 11430505.0, |
| "step": 463 |
| }, |
| { |
| "epoch": 7.871794871794872, |
| "grad_norm": 1.890625, |
| "learning_rate": 6.081234400486172e-07, |
| "loss": 0.4984, |
| "num_tokens": 11455251.0, |
| "step": 464 |
| }, |
| { |
| "epoch": 7.888888888888889, |
| "grad_norm": 1.9296875, |
| "learning_rate": 5.989850859999227e-07, |
| "loss": 0.4737, |
| "num_tokens": 11480173.0, |
| "step": 465 |
| }, |
| { |
| "epoch": 7.905982905982906, |
| "grad_norm": 1.84375, |
| "learning_rate": 5.899065604459814e-07, |
| "loss": 0.4607, |
| "num_tokens": 11504623.0, |
| "step": 466 |
| }, |
| { |
| "epoch": 7.923076923076923, |
| "grad_norm": 1.7734375, |
| "learning_rate": 5.808881491049723e-07, |
| "loss": 0.4354, |
| "num_tokens": 11529494.0, |
| "step": 467 |
| }, |
| { |
| "epoch": 7.94017094017094, |
| "grad_norm": 1.8203125, |
| "learning_rate": 5.719301358031665e-07, |
| "loss": 0.4639, |
| "num_tokens": 11553965.0, |
| "step": 468 |
| }, |
| { |
| "epoch": 7.957264957264957, |
| "grad_norm": 1.8984375, |
| "learning_rate": 5.630328024659979e-07, |
| "loss": 0.4911, |
| "num_tokens": 11578169.0, |
| "step": 469 |
| }, |
| { |
| "epoch": 7.9743589743589745, |
| "grad_norm": 1.8515625, |
| "learning_rate": 5.541964291091856e-07, |
| "loss": 0.4318, |
| "num_tokens": 11603175.0, |
| "step": 470 |
| }, |
| { |
| "epoch": 7.9914529914529915, |
| "grad_norm": 1.9609375, |
| "learning_rate": 5.454212938299256e-07, |
| "loss": 0.4223, |
| "num_tokens": 11626457.0, |
| "step": 471 |
| }, |
| { |
| "epoch": 8.0, |
| "grad_norm": 2.953125, |
| "learning_rate": 5.367076727981383e-07, |
| "loss": 0.4906, |
| "num_tokens": 11635688.0, |
| "step": 472 |
| }, |
| { |
| "epoch": 8.017094017094017, |
| "grad_norm": 1.8515625, |
| "learning_rate": 5.280558402477726e-07, |
| "loss": 0.4614, |
| "num_tokens": 11661177.0, |
| "step": 473 |
| }, |
| { |
| "epoch": 8.034188034188034, |
| "grad_norm": 1.875, |
| "learning_rate": 5.194660684681818e-07, |
| "loss": 0.4407, |
| "num_tokens": 11685654.0, |
| "step": 474 |
| }, |
| { |
| "epoch": 8.051282051282051, |
| "grad_norm": 1.78125, |
| "learning_rate": 5.109386277955477e-07, |
| "loss": 0.4605, |
| "num_tokens": 11710852.0, |
| "step": 475 |
| }, |
| { |
| "epoch": 8.068376068376068, |
| "grad_norm": 1.890625, |
| "learning_rate": 5.02473786604378e-07, |
| "loss": 0.4734, |
| "num_tokens": 11735336.0, |
| "step": 476 |
| }, |
| { |
| "epoch": 8.085470085470085, |
| "grad_norm": 1.8203125, |
| "learning_rate": 4.940718112990553e-07, |
| "loss": 0.4598, |
| "num_tokens": 11760061.0, |
| "step": 477 |
| }, |
| { |
| "epoch": 8.102564102564102, |
| "grad_norm": 1.84375, |
| "learning_rate": 4.857329663054569e-07, |
| "loss": 0.4642, |
| "num_tokens": 11784720.0, |
| "step": 478 |
| }, |
| { |
| "epoch": 8.11965811965812, |
| "grad_norm": 1.8125, |
| "learning_rate": 4.774575140626317e-07, |
| "loss": 0.4626, |
| "num_tokens": 11809420.0, |
| "step": 479 |
| }, |
| { |
| "epoch": 8.136752136752136, |
| "grad_norm": 1.890625, |
| "learning_rate": 4.6924571501453743e-07, |
| "loss": 0.4586, |
| "num_tokens": 11833862.0, |
| "step": 480 |
| }, |
| { |
| "epoch": 8.136752136752136, |
| "eval_loss": 0.5508862733840942, |
| "eval_num_tokens": 11833862.0, |
| "eval_runtime": 3.8867, |
| "eval_samples_per_second": 339.36, |
| "eval_steps_per_second": 10.806, |
| "step": 480 |
| }, |
| { |
| "epoch": 8.153846153846153, |
| "grad_norm": 1.828125, |
| "learning_rate": 4.610978276018496e-07, |
| "loss": 0.5026, |
| "num_tokens": 11858120.0, |
| "step": 481 |
| }, |
| { |
| "epoch": 8.17094017094017, |
| "grad_norm": 1.71875, |
| "learning_rate": 4.530141082538231e-07, |
| "loss": 0.426, |
| "num_tokens": 11883665.0, |
| "step": 482 |
| }, |
| { |
| "epoch": 8.188034188034187, |
| "grad_norm": 1.8671875, |
| "learning_rate": 4.4499481138022546e-07, |
| "loss": 0.4912, |
| "num_tokens": 11907624.0, |
| "step": 483 |
| }, |
| { |
| "epoch": 8.205128205128204, |
| "grad_norm": 1.875, |
| "learning_rate": 4.370401893633261e-07, |
| "loss": 0.4683, |
| "num_tokens": 11931156.0, |
| "step": 484 |
| }, |
| { |
| "epoch": 8.222222222222221, |
| "grad_norm": 1.8203125, |
| "learning_rate": 4.29150492549959e-07, |
| "loss": 0.4757, |
| "num_tokens": 11956116.0, |
| "step": 485 |
| }, |
| { |
| "epoch": 8.239316239316238, |
| "grad_norm": 1.8671875, |
| "learning_rate": 4.2132596924363666e-07, |
| "loss": 0.4757, |
| "num_tokens": 11981019.0, |
| "step": 486 |
| }, |
| { |
| "epoch": 8.256410256410255, |
| "grad_norm": 1.8125, |
| "learning_rate": 4.1356686569674344e-07, |
| "loss": 0.4684, |
| "num_tokens": 12006158.0, |
| "step": 487 |
| }, |
| { |
| "epoch": 8.273504273504274, |
| "grad_norm": 1.7890625, |
| "learning_rate": 4.058734261027789e-07, |
| "loss": 0.4658, |
| "num_tokens": 12031870.0, |
| "step": 488 |
| }, |
| { |
| "epoch": 8.290598290598291, |
| "grad_norm": 1.8515625, |
| "learning_rate": 3.982458925886748e-07, |
| "loss": 0.436, |
| "num_tokens": 12055810.0, |
| "step": 489 |
| }, |
| { |
| "epoch": 8.307692307692308, |
| "grad_norm": 1.859375, |
| "learning_rate": 3.9068450520717784e-07, |
| "loss": 0.4608, |
| "num_tokens": 12080118.0, |
| "step": 490 |
| }, |
| { |
| "epoch": 8.324786324786325, |
| "grad_norm": 1.78125, |
| "learning_rate": 3.831895019292897e-07, |
| "loss": 0.4328, |
| "num_tokens": 12104565.0, |
| "step": 491 |
| }, |
| { |
| "epoch": 8.341880341880342, |
| "grad_norm": 1.8359375, |
| "learning_rate": 3.757611186367824e-07, |
| "loss": 0.5016, |
| "num_tokens": 12129884.0, |
| "step": 492 |
| }, |
| { |
| "epoch": 8.35897435897436, |
| "grad_norm": 1.859375, |
| "learning_rate": 3.683995891147696e-07, |
| "loss": 0.4719, |
| "num_tokens": 12154751.0, |
| "step": 493 |
| }, |
| { |
| "epoch": 8.376068376068377, |
| "grad_norm": 1.9296875, |
| "learning_rate": 3.611051450443551e-07, |
| "loss": 0.473, |
| "num_tokens": 12179366.0, |
| "step": 494 |
| }, |
| { |
| "epoch": 8.393162393162394, |
| "grad_norm": 1.8046875, |
| "learning_rate": 3.538780159953348e-07, |
| "loss": 0.4526, |
| "num_tokens": 12204166.0, |
| "step": 495 |
| }, |
| { |
| "epoch": 8.41025641025641, |
| "grad_norm": 1.8359375, |
| "learning_rate": 3.4671842941897764e-07, |
| "loss": 0.4377, |
| "num_tokens": 12228036.0, |
| "step": 496 |
| }, |
| { |
| "epoch": 8.427350427350428, |
| "grad_norm": 1.75, |
| "learning_rate": 3.3962661064086356e-07, |
| "loss": 0.4701, |
| "num_tokens": 12252716.0, |
| "step": 497 |
| }, |
| { |
| "epoch": 8.444444444444445, |
| "grad_norm": 1.7578125, |
| "learning_rate": 3.3260278285379225e-07, |
| "loss": 0.448, |
| "num_tokens": 12278652.0, |
| "step": 498 |
| }, |
| { |
| "epoch": 8.461538461538462, |
| "grad_norm": 1.8359375, |
| "learning_rate": 3.256471671107617e-07, |
| "loss": 0.477, |
| "num_tokens": 12303497.0, |
| "step": 499 |
| }, |
| { |
| "epoch": 8.478632478632479, |
| "grad_norm": 1.890625, |
| "learning_rate": 3.187599823180071e-07, |
| "loss": 0.4697, |
| "num_tokens": 12328553.0, |
| "step": 500 |
| }, |
| { |
| "epoch": 8.495726495726496, |
| "grad_norm": 1.8203125, |
| "learning_rate": 3.119414452281158e-07, |
| "loss": 0.4502, |
| "num_tokens": 12354244.0, |
| "step": 501 |
| }, |
| { |
| "epoch": 8.512820512820513, |
| "grad_norm": 1.8515625, |
| "learning_rate": 3.051917704332016e-07, |
| "loss": 0.4643, |
| "num_tokens": 12379660.0, |
| "step": 502 |
| }, |
| { |
| "epoch": 8.52991452991453, |
| "grad_norm": 1.8984375, |
| "learning_rate": 2.98511170358155e-07, |
| "loss": 0.4925, |
| "num_tokens": 12404081.0, |
| "step": 503 |
| }, |
| { |
| "epoch": 8.547008547008547, |
| "grad_norm": 1.8203125, |
| "learning_rate": 2.918998552539545e-07, |
| "loss": 0.4617, |
| "num_tokens": 12428994.0, |
| "step": 504 |
| }, |
| { |
| "epoch": 8.564102564102564, |
| "grad_norm": 1.9296875, |
| "learning_rate": 2.8535803319105047e-07, |
| "loss": 0.5077, |
| "num_tokens": 12454375.0, |
| "step": 505 |
| }, |
| { |
| "epoch": 8.581196581196581, |
| "grad_norm": 1.71875, |
| "learning_rate": 2.788859100528196e-07, |
| "loss": 0.4401, |
| "num_tokens": 12479403.0, |
| "step": 506 |
| }, |
| { |
| "epoch": 8.598290598290598, |
| "grad_norm": 1.90625, |
| "learning_rate": 2.7248368952908055e-07, |
| "loss": 0.4569, |
| "num_tokens": 12503591.0, |
| "step": 507 |
| }, |
| { |
| "epoch": 8.615384615384615, |
| "grad_norm": 1.875, |
| "learning_rate": 2.6615157310968783e-07, |
| "loss": 0.4808, |
| "num_tokens": 12528170.0, |
| "step": 508 |
| }, |
| { |
| "epoch": 8.632478632478632, |
| "grad_norm": 1.859375, |
| "learning_rate": 2.598897600781872e-07, |
| "loss": 0.4257, |
| "num_tokens": 12552175.0, |
| "step": 509 |
| }, |
| { |
| "epoch": 8.649572649572649, |
| "grad_norm": 1.828125, |
| "learning_rate": 2.5369844750554704e-07, |
| "loss": 0.4638, |
| "num_tokens": 12576744.0, |
| "step": 510 |
| }, |
| { |
| "epoch": 8.649572649572649, |
| "eval_loss": 0.5507171154022217, |
| "eval_num_tokens": 12576744.0, |
| "eval_runtime": 3.7722, |
| "eval_samples_per_second": 349.662, |
| "eval_steps_per_second": 11.134, |
| "step": 510 |
| }, |
| { |
| "epoch": 8.666666666666666, |
| "grad_norm": 1.8828125, |
| "learning_rate": 2.4757783024395244e-07, |
| "loss": 0.4743, |
| "num_tokens": 12602284.0, |
| "step": 511 |
| }, |
| { |
| "epoch": 8.683760683760683, |
| "grad_norm": 1.7890625, |
| "learning_rate": 2.415281009206766e-07, |
| "loss": 0.3997, |
| "num_tokens": 12627067.0, |
| "step": 512 |
| }, |
| { |
| "epoch": 8.7008547008547, |
| "grad_norm": 1.8046875, |
| "learning_rate": 2.355494499320149e-07, |
| "loss": 0.484, |
| "num_tokens": 12652451.0, |
| "step": 513 |
| }, |
| { |
| "epoch": 8.717948717948717, |
| "grad_norm": 1.7421875, |
| "learning_rate": 2.2964206543729662e-07, |
| "loss": 0.4632, |
| "num_tokens": 12678633.0, |
| "step": 514 |
| }, |
| { |
| "epoch": 8.735042735042736, |
| "grad_norm": 1.7734375, |
| "learning_rate": 2.2380613335296037e-07, |
| "loss": 0.4607, |
| "num_tokens": 12703802.0, |
| "step": 515 |
| }, |
| { |
| "epoch": 8.752136752136753, |
| "grad_norm": 1.78125, |
| "learning_rate": 2.1804183734670277e-07, |
| "loss": 0.4545, |
| "num_tokens": 12728959.0, |
| "step": 516 |
| }, |
| { |
| "epoch": 8.76923076923077, |
| "grad_norm": 1.78125, |
| "learning_rate": 2.1234935883170048e-07, |
| "loss": 0.4693, |
| "num_tokens": 12754654.0, |
| "step": 517 |
| }, |
| { |
| "epoch": 8.786324786324787, |
| "grad_norm": 1.84375, |
| "learning_rate": 2.0672887696089826e-07, |
| "loss": 0.4835, |
| "num_tokens": 12779552.0, |
| "step": 518 |
| }, |
| { |
| "epoch": 8.803418803418804, |
| "grad_norm": 1.8046875, |
| "learning_rate": 2.0118056862137358e-07, |
| "loss": 0.4516, |
| "num_tokens": 12804786.0, |
| "step": 519 |
| }, |
| { |
| "epoch": 8.820512820512821, |
| "grad_norm": 1.828125, |
| "learning_rate": 1.9570460842876532e-07, |
| "loss": 0.4568, |
| "num_tokens": 12829963.0, |
| "step": 520 |
| }, |
| { |
| "epoch": 8.837606837606838, |
| "grad_norm": 1.7421875, |
| "learning_rate": 1.9030116872178317e-07, |
| "loss": 0.471, |
| "num_tokens": 12855101.0, |
| "step": 521 |
| }, |
| { |
| "epoch": 8.854700854700855, |
| "grad_norm": 1.875, |
| "learning_rate": 1.8497041955678057e-07, |
| "loss": 0.4531, |
| "num_tokens": 12880194.0, |
| "step": 522 |
| }, |
| { |
| "epoch": 8.871794871794872, |
| "grad_norm": 1.859375, |
| "learning_rate": 1.7971252870240292e-07, |
| "loss": 0.4766, |
| "num_tokens": 12905170.0, |
| "step": 523 |
| }, |
| { |
| "epoch": 8.88888888888889, |
| "grad_norm": 1.8515625, |
| "learning_rate": 1.7452766163430973e-07, |
| "loss": 0.4839, |
| "num_tokens": 12931567.0, |
| "step": 524 |
| }, |
| { |
| "epoch": 8.905982905982906, |
| "grad_norm": 1.7890625, |
| "learning_rate": 1.6941598152996453e-07, |
| "loss": 0.4803, |
| "num_tokens": 12957115.0, |
| "step": 525 |
| }, |
| { |
| "epoch": 8.923076923076923, |
| "grad_norm": 1.8515625, |
| "learning_rate": 1.6437764926350074e-07, |
| "loss": 0.4782, |
| "num_tokens": 12982463.0, |
| "step": 526 |
| }, |
| { |
| "epoch": 8.94017094017094, |
| "grad_norm": 1.8984375, |
| "learning_rate": 1.59412823400657e-07, |
| "loss": 0.4778, |
| "num_tokens": 13006357.0, |
| "step": 527 |
| }, |
| { |
| "epoch": 8.957264957264957, |
| "grad_norm": 1.9921875, |
| "learning_rate": 1.5452166019378989e-07, |
| "loss": 0.5211, |
| "num_tokens": 13030925.0, |
| "step": 528 |
| }, |
| { |
| "epoch": 8.974358974358974, |
| "grad_norm": 1.8515625, |
| "learning_rate": 1.4970431357695241e-07, |
| "loss": 0.4759, |
| "num_tokens": 13055908.0, |
| "step": 529 |
| }, |
| { |
| "epoch": 8.991452991452991, |
| "grad_norm": 1.734375, |
| "learning_rate": 1.449609351610526e-07, |
| "loss": 0.4291, |
| "num_tokens": 13080922.0, |
| "step": 530 |
| }, |
| { |
| "epoch": 9.0, |
| "grad_norm": 3.359375, |
| "learning_rate": 1.4029167422908107e-07, |
| "loss": 0.5111, |
| "num_tokens": 13090149.0, |
| "step": 531 |
| }, |
| { |
| "epoch": 9.017094017094017, |
| "grad_norm": 1.765625, |
| "learning_rate": 1.3569667773141143e-07, |
| "loss": 0.4281, |
| "num_tokens": 13114358.0, |
| "step": 532 |
| }, |
| { |
| "epoch": 9.034188034188034, |
| "grad_norm": 1.8046875, |
| "learning_rate": 1.3117609028117816e-07, |
| "loss": 0.4658, |
| "num_tokens": 13139842.0, |
| "step": 533 |
| }, |
| { |
| "epoch": 9.051282051282051, |
| "grad_norm": 1.8671875, |
| "learning_rate": 1.2673005414972184e-07, |
| "loss": 0.453, |
| "num_tokens": 13164650.0, |
| "step": 534 |
| }, |
| { |
| "epoch": 9.068376068376068, |
| "grad_norm": 1.8359375, |
| "learning_rate": 1.223587092621162e-07, |
| "loss": 0.4579, |
| "num_tokens": 13189596.0, |
| "step": 535 |
| }, |
| { |
| "epoch": 9.085470085470085, |
| "grad_norm": 1.8203125, |
| "learning_rate": 1.1806219319275918e-07, |
| "loss": 0.4829, |
| "num_tokens": 13214616.0, |
| "step": 536 |
| }, |
| { |
| "epoch": 9.102564102564102, |
| "grad_norm": 1.75, |
| "learning_rate": 1.138406411610482e-07, |
| "loss": 0.4632, |
| "num_tokens": 13239530.0, |
| "step": 537 |
| }, |
| { |
| "epoch": 9.11965811965812, |
| "grad_norm": 1.7890625, |
| "learning_rate": 1.0969418602712001e-07, |
| "loss": 0.4663, |
| "num_tokens": 13264484.0, |
| "step": 538 |
| }, |
| { |
| "epoch": 9.136752136752136, |
| "grad_norm": 1.9140625, |
| "learning_rate": 1.0562295828767388e-07, |
| "loss": 0.4653, |
| "num_tokens": 13288508.0, |
| "step": 539 |
| }, |
| { |
| "epoch": 9.153846153846153, |
| "grad_norm": 1.953125, |
| "learning_rate": 1.0162708607186044e-07, |
| "loss": 0.4705, |
| "num_tokens": 13312812.0, |
| "step": 540 |
| }, |
| { |
| "epoch": 9.153846153846153, |
| "eval_loss": 0.550665557384491, |
| "eval_num_tokens": 13312812.0, |
| "eval_runtime": 3.7846, |
| "eval_samples_per_second": 348.522, |
| "eval_steps_per_second": 11.098, |
| "step": 540 |
| }, |
| { |
| "epoch": 9.17094017094017, |
| "grad_norm": 1.8125, |
| "learning_rate": 9.770669513725128e-08, |
| "loss": 0.4354, |
| "num_tokens": 13337364.0, |
| "step": 541 |
| }, |
| { |
| "epoch": 9.188034188034187, |
| "grad_norm": 1.9609375, |
| "learning_rate": 9.386190886588208e-08, |
| "loss": 0.4643, |
| "num_tokens": 13362934.0, |
| "step": 542 |
| }, |
| { |
| "epoch": 9.205128205128204, |
| "grad_norm": 1.8125, |
| "learning_rate": 9.00928482603669e-08, |
| "loss": 0.4821, |
| "num_tokens": 13387608.0, |
| "step": 543 |
| }, |
| { |
| "epoch": 9.222222222222221, |
| "grad_norm": 1.796875, |
| "learning_rate": 8.639963194009282e-08, |
| "loss": 0.4411, |
| "num_tokens": 13412296.0, |
| "step": 544 |
| }, |
| { |
| "epoch": 9.239316239316238, |
| "grad_norm": 1.7265625, |
| "learning_rate": 8.278237613748408e-08, |
| "loss": 0.4886, |
| "num_tokens": 13438087.0, |
| "step": 545 |
| }, |
| { |
| "epoch": 9.256410256410255, |
| "grad_norm": 1.9375, |
| "learning_rate": 7.924119469434666e-08, |
| "loss": 0.5127, |
| "num_tokens": 13462986.0, |
| "step": 546 |
| }, |
| { |
| "epoch": 9.273504273504274, |
| "grad_norm": 2.0, |
| "learning_rate": 7.577619905828281e-08, |
| "loss": 0.4918, |
| "num_tokens": 13486219.0, |
| "step": 547 |
| }, |
| { |
| "epoch": 9.290598290598291, |
| "grad_norm": 1.7734375, |
| "learning_rate": 7.238749827918639e-08, |
| "loss": 0.4315, |
| "num_tokens": 13511066.0, |
| "step": 548 |
| }, |
| { |
| "epoch": 9.307692307692308, |
| "grad_norm": 1.796875, |
| "learning_rate": 6.907519900580862e-08, |
| "loss": 0.4421, |
| "num_tokens": 13536763.0, |
| "step": 549 |
| }, |
| { |
| "epoch": 9.324786324786325, |
| "grad_norm": 1.796875, |
| "learning_rate": 6.583940548240186e-08, |
| "loss": 0.4938, |
| "num_tokens": 13562043.0, |
| "step": 550 |
| }, |
| { |
| "epoch": 9.341880341880342, |
| "grad_norm": 1.6796875, |
| "learning_rate": 6.268021954544095e-08, |
| "loss": 0.4603, |
| "num_tokens": 13587428.0, |
| "step": 551 |
| }, |
| { |
| "epoch": 9.35897435897436, |
| "grad_norm": 1.9296875, |
| "learning_rate": 5.95977406204154e-08, |
| "loss": 0.4731, |
| "num_tokens": 13611667.0, |
| "step": 552 |
| }, |
| { |
| "epoch": 9.376068376068377, |
| "grad_norm": 1.859375, |
| "learning_rate": 5.659206571870218e-08, |
| "loss": 0.4834, |
| "num_tokens": 13637877.0, |
| "step": 553 |
| }, |
| { |
| "epoch": 9.393162393162394, |
| "grad_norm": 1.921875, |
| "learning_rate": 5.366328943451154e-08, |
| "loss": 0.4741, |
| "num_tokens": 13662051.0, |
| "step": 554 |
| }, |
| { |
| "epoch": 9.41025641025641, |
| "grad_norm": 1.8671875, |
| "learning_rate": 5.0811503941911314e-08, |
| "loss": 0.4699, |
| "num_tokens": 13687351.0, |
| "step": 555 |
| }, |
| { |
| "epoch": 9.427350427350428, |
| "grad_norm": 1.828125, |
| "learning_rate": 4.8036798991923925e-08, |
| "loss": 0.4709, |
| "num_tokens": 13712712.0, |
| "step": 556 |
| }, |
| { |
| "epoch": 9.444444444444445, |
| "grad_norm": 1.84375, |
| "learning_rate": 4.5339261909704e-08, |
| "loss": 0.4521, |
| "num_tokens": 13737853.0, |
| "step": 557 |
| }, |
| { |
| "epoch": 9.461538461538462, |
| "grad_norm": 1.84375, |
| "learning_rate": 4.2718977591788836e-08, |
| "loss": 0.4645, |
| "num_tokens": 13762998.0, |
| "step": 558 |
| }, |
| { |
| "epoch": 9.478632478632479, |
| "grad_norm": 1.9453125, |
| "learning_rate": 4.017602850342584e-08, |
| "loss": 0.4701, |
| "num_tokens": 13787441.0, |
| "step": 559 |
| }, |
| { |
| "epoch": 9.495726495726496, |
| "grad_norm": 1.84375, |
| "learning_rate": 3.771049467597959e-08, |
| "loss": 0.5049, |
| "num_tokens": 13813278.0, |
| "step": 560 |
| }, |
| { |
| "epoch": 9.512820512820513, |
| "grad_norm": 1.8671875, |
| "learning_rate": 3.5322453704410286e-08, |
| "loss": 0.4402, |
| "num_tokens": 13837870.0, |
| "step": 561 |
| }, |
| { |
| "epoch": 9.52991452991453, |
| "grad_norm": 1.8828125, |
| "learning_rate": 3.3011980744833974e-08, |
| "loss": 0.4524, |
| "num_tokens": 13861602.0, |
| "step": 562 |
| }, |
| { |
| "epoch": 9.547008547008547, |
| "grad_norm": 1.8515625, |
| "learning_rate": 3.077914851215585e-08, |
| "loss": 0.4996, |
| "num_tokens": 13887368.0, |
| "step": 563 |
| }, |
| { |
| "epoch": 9.564102564102564, |
| "grad_norm": 1.84375, |
| "learning_rate": 2.8624027277781852e-08, |
| "loss": 0.4986, |
| "num_tokens": 13912654.0, |
| "step": 564 |
| }, |
| { |
| "epoch": 9.581196581196581, |
| "grad_norm": 1.859375, |
| "learning_rate": 2.6546684867408412e-08, |
| "loss": 0.4248, |
| "num_tokens": 13937672.0, |
| "step": 565 |
| }, |
| { |
| "epoch": 9.598290598290598, |
| "grad_norm": 1.8515625, |
| "learning_rate": 2.454718665888589e-08, |
| "loss": 0.468, |
| "num_tokens": 13963103.0, |
| "step": 566 |
| }, |
| { |
| "epoch": 9.615384615384615, |
| "grad_norm": 1.7421875, |
| "learning_rate": 2.262559558016325e-08, |
| "loss": 0.4627, |
| "num_tokens": 13987379.0, |
| "step": 567 |
| }, |
| { |
| "epoch": 9.632478632478632, |
| "grad_norm": 1.7890625, |
| "learning_rate": 2.078197210730465e-08, |
| "loss": 0.4802, |
| "num_tokens": 14011324.0, |
| "step": 568 |
| }, |
| { |
| "epoch": 9.649572649572649, |
| "grad_norm": 1.8046875, |
| "learning_rate": 1.9016374262589842e-08, |
| "loss": 0.4588, |
| "num_tokens": 14036876.0, |
| "step": 569 |
| }, |
| { |
| "epoch": 9.666666666666666, |
| "grad_norm": 1.8046875, |
| "learning_rate": 1.732885761268427e-08, |
| "loss": 0.4391, |
| "num_tokens": 14061453.0, |
| "step": 570 |
| }, |
| { |
| "epoch": 9.666666666666666, |
| "eval_loss": 0.550795316696167, |
| "eval_num_tokens": 14061453.0, |
| "eval_runtime": 3.8222, |
| "eval_samples_per_second": 345.091, |
| "eval_steps_per_second": 10.988, |
| "step": 570 |
| }, |
| { |
| "epoch": 9.683760683760683, |
| "grad_norm": 1.8203125, |
| "learning_rate": 1.571947526689349e-08, |
| "loss": 0.4896, |
| "num_tokens": 14086842.0, |
| "step": 571 |
| }, |
| { |
| "epoch": 9.7008547008547, |
| "grad_norm": 1.8203125, |
| "learning_rate": 1.418827787548982e-08, |
| "loss": 0.4609, |
| "num_tokens": 14112308.0, |
| "step": 572 |
| }, |
| { |
| "epoch": 9.717948717948717, |
| "grad_norm": 1.8671875, |
| "learning_rate": 1.273531362811914e-08, |
| "loss": 0.4825, |
| "num_tokens": 14137169.0, |
| "step": 573 |
| }, |
| { |
| "epoch": 9.735042735042736, |
| "grad_norm": 1.8515625, |
| "learning_rate": 1.1360628252283513e-08, |
| "loss": 0.4742, |
| "num_tokens": 14161587.0, |
| "step": 574 |
| }, |
| { |
| "epoch": 9.752136752136753, |
| "grad_norm": 1.78125, |
| "learning_rate": 1.006426501190233e-08, |
| "loss": 0.4446, |
| "num_tokens": 14186463.0, |
| "step": 575 |
| }, |
| { |
| "epoch": 9.76923076923077, |
| "grad_norm": 1.75, |
| "learning_rate": 8.84626470595229e-09, |
| "loss": 0.4671, |
| "num_tokens": 14211825.0, |
| "step": 576 |
| }, |
| { |
| "epoch": 9.786324786324787, |
| "grad_norm": 1.8671875, |
| "learning_rate": 7.70666566718009e-09, |
| "loss": 0.4834, |
| "num_tokens": 14237567.0, |
| "step": 577 |
| }, |
| { |
| "epoch": 9.803418803418804, |
| "grad_norm": 1.9296875, |
| "learning_rate": 6.645503760899508e-09, |
| "loss": 0.4333, |
| "num_tokens": 14262209.0, |
| "step": 578 |
| }, |
| { |
| "epoch": 9.820512820512821, |
| "grad_norm": 1.8046875, |
| "learning_rate": 5.662812383859795e-09, |
| "loss": 0.4579, |
| "num_tokens": 14287579.0, |
| "step": 579 |
| }, |
| { |
| "epoch": 9.837606837606838, |
| "grad_norm": 1.84375, |
| "learning_rate": 4.758622463196805e-09, |
| "loss": 0.4538, |
| "num_tokens": 14311419.0, |
| "step": 580 |
| }, |
| { |
| "epoch": 9.854700854700855, |
| "grad_norm": 1.875, |
| "learning_rate": 3.932962455458489e-09, |
| "loss": 0.4787, |
| "num_tokens": 14336303.0, |
| "step": 581 |
| }, |
| { |
| "epoch": 9.871794871794872, |
| "grad_norm": 1.78125, |
| "learning_rate": 3.1858583457095026e-09, |
| "loss": 0.4192, |
| "num_tokens": 14361504.0, |
| "step": 582 |
| }, |
| { |
| "epoch": 9.88888888888889, |
| "grad_norm": 1.7578125, |
| "learning_rate": 2.5173336467135266e-09, |
| "loss": 0.4746, |
| "num_tokens": 14386137.0, |
| "step": 583 |
| }, |
| { |
| "epoch": 9.905982905982906, |
| "grad_norm": 1.8671875, |
| "learning_rate": 1.9274093981927476e-09, |
| "loss": 0.4572, |
| "num_tokens": 14410674.0, |
| "step": 584 |
| }, |
| { |
| "epoch": 9.923076923076923, |
| "grad_norm": 1.8671875, |
| "learning_rate": 1.4161041661667208e-09, |
| "loss": 0.4646, |
| "num_tokens": 14434870.0, |
| "step": 585 |
| }, |
| { |
| "epoch": 9.94017094017094, |
| "grad_norm": 1.8671875, |
| "learning_rate": 9.834340423678368e-10, |
| "loss": 0.4593, |
| "num_tokens": 14460092.0, |
| "step": 586 |
| }, |
| { |
| "epoch": 9.957264957264957, |
| "grad_norm": 1.828125, |
| "learning_rate": 6.294126437336734e-10, |
| "loss": 0.4751, |
| "num_tokens": 14485799.0, |
| "step": 587 |
| }, |
| { |
| "epoch": 9.974358974358974, |
| "grad_norm": 1.78125, |
| "learning_rate": 3.5405111197955866e-10, |
| "loss": 0.4483, |
| "num_tokens": 14510709.0, |
| "step": 588 |
| }, |
| { |
| "epoch": 9.991452991452991, |
| "grad_norm": 1.9921875, |
| "learning_rate": 1.5735811324857354e-10, |
| "loss": 0.4797, |
| "num_tokens": 14534764.0, |
| "step": 589 |
| }, |
| { |
| "epoch": 10.0, |
| "grad_norm": 2.734375, |
| "learning_rate": 3.933983783677153e-11, |
| "loss": 0.4571, |
| "num_tokens": 14544610.0, |
| "step": 590 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 590, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 10, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.3842096471023616e+17, |
| "train_batch_size": 16, |
| "trial_name": null, |
| "trial_params": null |
| } |
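| |
| The block above is a complete `trainer_state.json` of the kind the Hugging Face `transformers` `Trainer` writes next to each checkpoint. As a minimal sketch of how to consume it (the file path is an assumption; only the `log_history` and `max_steps` fields present above are relied on), the per-step training records can be separated from the periodic evaluation records like this: |
| |
| import json |
| |
| # Assumed path: the Trainer saves this file inside each checkpoint |
| # directory, e.g. checkpoint-590/trainer_state.json. |
| with open("trainer_state.json") as f: |
|     state = json.load(f) |
| |
| # log_history mixes per-step training records (keyed by "loss") with |
| # periodic evaluation records (keyed by "eval_loss"). |
| train_logs = [r for r in state["log_history"] if "loss" in r] |
| eval_logs = [r for r in state["log_history"] if "eval_loss" in r] |
| |
| last = train_logs[-1] |
| print(f"trained {last['step']} of {state['max_steps']} steps; " |
|       f"final train loss {last['loss']:.4f}") |
| |
| best = min(eval_logs, key=lambda r: r["eval_loss"]) |
| print(f"best eval loss {best['eval_loss']:.4f} at step {best['step']}") |
| |
| Run against this state, the sketch would surface the lowest eval_loss among the evaluations recorded above (0.5507 at step 540), while the train loss keeps easing down to 0.4571 at the final step 590. |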