| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 1.9902912621359223, |
| "eval_steps": 500, |
| "global_step": 308, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.006472491909385114, |
| "grad_norm": 5.306275844573975, |
| "learning_rate": 0.0, |
| "loss": 1.1501, |
| "num_tokens": 963559.0, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.012944983818770227, |
| "grad_norm": 5.308248996734619, |
| "learning_rate": 3.2258064516129035e-07, |
| "loss": 1.1455, |
| "num_tokens": 1927760.0, |
| "step": 2 |
| }, |
| { |
| "epoch": 0.019417475728155338, |
| "grad_norm": 5.251079559326172, |
| "learning_rate": 6.451612903225807e-07, |
| "loss": 1.1391, |
| "num_tokens": 2896369.0, |
| "step": 3 |
| }, |
| { |
| "epoch": 0.025889967637540454, |
| "grad_norm": 5.1489105224609375, |
| "learning_rate": 9.67741935483871e-07, |
| "loss": 1.1296, |
| "num_tokens": 3886583.0, |
| "step": 4 |
| }, |
| { |
| "epoch": 0.032362459546925564, |
| "grad_norm": 4.995655059814453, |
| "learning_rate": 1.2903225806451614e-06, |
| "loss": 1.1222, |
| "num_tokens": 4864476.0, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.038834951456310676, |
| "grad_norm": 4.873946189880371, |
| "learning_rate": 1.6129032258064516e-06, |
| "loss": 1.1242, |
| "num_tokens": 5873295.0, |
| "step": 6 |
| }, |
| { |
| "epoch": 0.045307443365695796, |
| "grad_norm": 4.26630973815918, |
| "learning_rate": 1.935483870967742e-06, |
| "loss": 1.0614, |
| "num_tokens": 6876484.0, |
| "step": 7 |
| }, |
| { |
| "epoch": 0.05177993527508091, |
| "grad_norm": 4.14864444732666, |
| "learning_rate": 2.2580645161290324e-06, |
| "loss": 1.0421, |
| "num_tokens": 7871664.0, |
| "step": 8 |
| }, |
| { |
| "epoch": 0.05825242718446602, |
| "grad_norm": 3.062345504760742, |
| "learning_rate": 2.580645161290323e-06, |
| "loss": 0.9609, |
| "num_tokens": 8869931.0, |
| "step": 9 |
| }, |
| { |
| "epoch": 0.06472491909385113, |
| "grad_norm": 2.806725263595581, |
| "learning_rate": 2.903225806451613e-06, |
| "loss": 0.9313, |
| "num_tokens": 9874687.0, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.07119741100323625, |
| "grad_norm": 2.670482873916626, |
| "learning_rate": 3.225806451612903e-06, |
| "loss": 0.9088, |
| "num_tokens": 10842628.0, |
| "step": 11 |
| }, |
| { |
| "epoch": 0.07766990291262135, |
| "grad_norm": 1.8532583713531494, |
| "learning_rate": 3.548387096774194e-06, |
| "loss": 0.8088, |
| "num_tokens": 11815438.0, |
| "step": 12 |
| }, |
| { |
| "epoch": 0.08414239482200647, |
| "grad_norm": 1.6065739393234253, |
| "learning_rate": 3.870967741935484e-06, |
| "loss": 0.7868, |
| "num_tokens": 12788222.0, |
| "step": 13 |
| }, |
| { |
| "epoch": 0.09061488673139159, |
| "grad_norm": 1.3665945529937744, |
| "learning_rate": 4.193548387096774e-06, |
| "loss": 0.7594, |
| "num_tokens": 13756270.0, |
| "step": 14 |
| }, |
| { |
| "epoch": 0.0970873786407767, |
| "grad_norm": 1.265770673751831, |
| "learning_rate": 4.516129032258065e-06, |
| "loss": 0.7406, |
| "num_tokens": 14689813.0, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.10355987055016182, |
| "grad_norm": 1.1766362190246582, |
| "learning_rate": 4.838709677419355e-06, |
| "loss": 0.6938, |
| "num_tokens": 15670657.0, |
| "step": 16 |
| }, |
| { |
| "epoch": 0.11003236245954692, |
| "grad_norm": 1.0507408380508423, |
| "learning_rate": 5.161290322580646e-06, |
| "loss": 0.6834, |
| "num_tokens": 16640491.0, |
| "step": 17 |
| }, |
| { |
| "epoch": 0.11650485436893204, |
| "grad_norm": 0.8553951382637024, |
| "learning_rate": 5.483870967741935e-06, |
| "loss": 0.6743, |
| "num_tokens": 17611388.0, |
| "step": 18 |
| }, |
| { |
| "epoch": 0.12297734627831715, |
| "grad_norm": 0.6638854146003723, |
| "learning_rate": 5.806451612903226e-06, |
| "loss": 0.6604, |
| "num_tokens": 18582653.0, |
| "step": 19 |
| }, |
| { |
| "epoch": 0.12944983818770225, |
| "grad_norm": 0.5444458723068237, |
| "learning_rate": 6.129032258064517e-06, |
| "loss": 0.6427, |
| "num_tokens": 19557459.0, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.13592233009708737, |
| "grad_norm": 0.4539978504180908, |
| "learning_rate": 6.451612903225806e-06, |
| "loss": 0.6348, |
| "num_tokens": 20524158.0, |
| "step": 21 |
| }, |
| { |
| "epoch": 0.1423948220064725, |
| "grad_norm": 0.491314172744751, |
| "learning_rate": 6.774193548387097e-06, |
| "loss": 0.6084, |
| "num_tokens": 21502674.0, |
| "step": 22 |
| }, |
| { |
| "epoch": 0.1488673139158576, |
| "grad_norm": 0.47383806109428406, |
| "learning_rate": 7.096774193548388e-06, |
| "loss": 0.603, |
| "num_tokens": 22446294.0, |
| "step": 23 |
| }, |
| { |
| "epoch": 0.1553398058252427, |
| "grad_norm": 0.3826221525669098, |
| "learning_rate": 7.4193548387096784e-06, |
| "loss": 0.5929, |
| "num_tokens": 23430225.0, |
| "step": 24 |
| }, |
| { |
| "epoch": 0.16181229773462782, |
| "grad_norm": 0.33175480365753174, |
| "learning_rate": 7.741935483870968e-06, |
| "loss": 0.5837, |
| "num_tokens": 24380419.0, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.16828478964401294, |
| "grad_norm": 0.3459703326225281, |
| "learning_rate": 8.064516129032258e-06, |
| "loss": 0.5689, |
| "num_tokens": 25343499.0, |
| "step": 26 |
| }, |
| { |
| "epoch": 0.17475728155339806, |
| "grad_norm": 0.3513648509979248, |
| "learning_rate": 8.387096774193549e-06, |
| "loss": 0.5649, |
| "num_tokens": 26315921.0, |
| "step": 27 |
| }, |
| { |
| "epoch": 0.18122977346278318, |
| "grad_norm": 0.28894567489624023, |
| "learning_rate": 8.70967741935484e-06, |
| "loss": 0.5654, |
| "num_tokens": 27255686.0, |
| "step": 28 |
| }, |
| { |
| "epoch": 0.18770226537216828, |
| "grad_norm": 0.2634493112564087, |
| "learning_rate": 9.03225806451613e-06, |
| "loss": 0.555, |
| "num_tokens": 28232449.0, |
| "step": 29 |
| }, |
| { |
| "epoch": 0.1941747572815534, |
| "grad_norm": 0.25820741057395935, |
| "learning_rate": 9.35483870967742e-06, |
| "loss": 0.5464, |
| "num_tokens": 29213099.0, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.20064724919093851, |
| "grad_norm": 0.22817398607730865, |
| "learning_rate": 9.67741935483871e-06, |
| "loss": 0.5437, |
| "num_tokens": 30210546.0, |
| "step": 31 |
| }, |
| { |
| "epoch": 0.20711974110032363, |
| "grad_norm": 0.2116086781024933, |
| "learning_rate": 1e-05, |
| "loss": 0.5284, |
| "num_tokens": 31190215.0, |
| "step": 32 |
| }, |
| { |
| "epoch": 0.21359223300970873, |
| "grad_norm": 0.22091032564640045, |
| "learning_rate": 1e-05, |
| "loss": 0.5235, |
| "num_tokens": 32152368.0, |
| "step": 33 |
| }, |
| { |
| "epoch": 0.22006472491909385, |
| "grad_norm": 0.21065133810043335, |
| "learning_rate": 1e-05, |
| "loss": 0.5313, |
| "num_tokens": 33138410.0, |
| "step": 34 |
| }, |
| { |
| "epoch": 0.22653721682847897, |
| "grad_norm": 0.1995198130607605, |
| "learning_rate": 1e-05, |
| "loss": 0.5205, |
| "num_tokens": 34103559.0, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.23300970873786409, |
| "grad_norm": 0.193314790725708, |
| "learning_rate": 1e-05, |
| "loss": 0.5202, |
| "num_tokens": 35099702.0, |
| "step": 36 |
| }, |
| { |
| "epoch": 0.23948220064724918, |
| "grad_norm": 0.1861106902360916, |
| "learning_rate": 1e-05, |
| "loss": 0.5186, |
| "num_tokens": 36067246.0, |
| "step": 37 |
| }, |
| { |
| "epoch": 0.2459546925566343, |
| "grad_norm": 0.1703341007232666, |
| "learning_rate": 1e-05, |
| "loss": 0.5074, |
| "num_tokens": 37074157.0, |
| "step": 38 |
| }, |
| { |
| "epoch": 0.2524271844660194, |
| "grad_norm": 0.19160355627536774, |
| "learning_rate": 1e-05, |
| "loss": 0.5029, |
| "num_tokens": 38026864.0, |
| "step": 39 |
| }, |
| { |
| "epoch": 0.2588996763754045, |
| "grad_norm": 0.17060473561286926, |
| "learning_rate": 1e-05, |
| "loss": 0.4967, |
| "num_tokens": 39007950.0, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.26537216828478966, |
| "grad_norm": 0.17483553290367126, |
| "learning_rate": 1e-05, |
| "loss": 0.5045, |
| "num_tokens": 40004404.0, |
| "step": 41 |
| }, |
| { |
| "epoch": 0.27184466019417475, |
| "grad_norm": 0.16585178673267365, |
| "learning_rate": 1e-05, |
| "loss": 0.4985, |
| "num_tokens": 40979367.0, |
| "step": 42 |
| }, |
| { |
| "epoch": 0.2783171521035599, |
| "grad_norm": 0.16377580165863037, |
| "learning_rate": 1e-05, |
| "loss": 0.4941, |
| "num_tokens": 41954716.0, |
| "step": 43 |
| }, |
| { |
| "epoch": 0.284789644012945, |
| "grad_norm": 0.16064569354057312, |
| "learning_rate": 1e-05, |
| "loss": 0.5018, |
| "num_tokens": 42927994.0, |
| "step": 44 |
| }, |
| { |
| "epoch": 0.2912621359223301, |
| "grad_norm": 0.15038461983203888, |
| "learning_rate": 1e-05, |
| "loss": 0.4892, |
| "num_tokens": 43870458.0, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.2977346278317152, |
| "grad_norm": 0.15048164129257202, |
| "learning_rate": 1e-05, |
| "loss": 0.4884, |
| "num_tokens": 44845205.0, |
| "step": 46 |
| }, |
| { |
| "epoch": 0.3042071197411003, |
| "grad_norm": 0.15347424149513245, |
| "learning_rate": 1e-05, |
| "loss": 0.4861, |
| "num_tokens": 45817785.0, |
| "step": 47 |
| }, |
| { |
| "epoch": 0.3106796116504854, |
| "grad_norm": 0.14698895812034607, |
| "learning_rate": 1e-05, |
| "loss": 0.4881, |
| "num_tokens": 46768552.0, |
| "step": 48 |
| }, |
| { |
| "epoch": 0.31715210355987056, |
| "grad_norm": 0.14178597927093506, |
| "learning_rate": 1e-05, |
| "loss": 0.4721, |
| "num_tokens": 47720390.0, |
| "step": 49 |
| }, |
| { |
| "epoch": 0.32362459546925565, |
| "grad_norm": 0.15780387818813324, |
| "learning_rate": 1e-05, |
| "loss": 0.4818, |
| "num_tokens": 48659119.0, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.3300970873786408, |
| "grad_norm": 0.15007564425468445, |
| "learning_rate": 1e-05, |
| "loss": 0.4812, |
| "num_tokens": 49635896.0, |
| "step": 51 |
| }, |
| { |
| "epoch": 0.3365695792880259, |
| "grad_norm": 0.14719824492931366, |
| "learning_rate": 1e-05, |
| "loss": 0.4795, |
| "num_tokens": 50617481.0, |
| "step": 52 |
| }, |
| { |
| "epoch": 0.343042071197411, |
| "grad_norm": 0.14689336717128754, |
| "learning_rate": 1e-05, |
| "loss": 0.4749, |
| "num_tokens": 51580322.0, |
| "step": 53 |
| }, |
| { |
| "epoch": 0.34951456310679613, |
| "grad_norm": 0.14928674697875977, |
| "learning_rate": 1e-05, |
| "loss": 0.4772, |
| "num_tokens": 52544983.0, |
| "step": 54 |
| }, |
| { |
| "epoch": 0.3559870550161812, |
| "grad_norm": 0.14493519067764282, |
| "learning_rate": 1e-05, |
| "loss": 0.4696, |
| "num_tokens": 53538431.0, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.36245954692556637, |
| "grad_norm": 0.1502736210823059, |
| "learning_rate": 1e-05, |
| "loss": 0.4701, |
| "num_tokens": 54497097.0, |
| "step": 56 |
| }, |
| { |
| "epoch": 0.36893203883495146, |
| "grad_norm": 0.13822625577449799, |
| "learning_rate": 1e-05, |
| "loss": 0.46, |
| "num_tokens": 55515695.0, |
| "step": 57 |
| }, |
| { |
| "epoch": 0.37540453074433655, |
| "grad_norm": 0.15165378153324127, |
| "learning_rate": 1e-05, |
| "loss": 0.463, |
| "num_tokens": 56471100.0, |
| "step": 58 |
| }, |
| { |
| "epoch": 0.3818770226537217, |
| "grad_norm": 0.1460576206445694, |
| "learning_rate": 1e-05, |
| "loss": 0.4681, |
| "num_tokens": 57460948.0, |
| "step": 59 |
| }, |
| { |
| "epoch": 0.3883495145631068, |
| "grad_norm": 0.1480175405740738, |
| "learning_rate": 1e-05, |
| "loss": 0.4673, |
| "num_tokens": 58450663.0, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.3948220064724919, |
| "grad_norm": 0.15429073572158813, |
| "learning_rate": 1e-05, |
| "loss": 0.4617, |
| "num_tokens": 59407638.0, |
| "step": 61 |
| }, |
| { |
| "epoch": 0.40129449838187703, |
| "grad_norm": 0.14732158184051514, |
| "learning_rate": 1e-05, |
| "loss": 0.4577, |
| "num_tokens": 60358388.0, |
| "step": 62 |
| }, |
| { |
| "epoch": 0.4077669902912621, |
| "grad_norm": 0.1532926857471466, |
| "learning_rate": 1e-05, |
| "loss": 0.4534, |
| "num_tokens": 61306137.0, |
| "step": 63 |
| }, |
| { |
| "epoch": 0.41423948220064727, |
| "grad_norm": 0.15105584263801575, |
| "learning_rate": 1e-05, |
| "loss": 0.4643, |
| "num_tokens": 62298616.0, |
| "step": 64 |
| }, |
| { |
| "epoch": 0.42071197411003236, |
| "grad_norm": 0.15281681716442108, |
| "learning_rate": 1e-05, |
| "loss": 0.4544, |
| "num_tokens": 63242620.0, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.42718446601941745, |
| "grad_norm": 0.14645300805568695, |
| "learning_rate": 1e-05, |
| "loss": 0.4711, |
| "num_tokens": 64231555.0, |
| "step": 66 |
| }, |
| { |
| "epoch": 0.4336569579288026, |
| "grad_norm": 0.1446738988161087, |
| "learning_rate": 1e-05, |
| "loss": 0.4586, |
| "num_tokens": 65190302.0, |
| "step": 67 |
| }, |
| { |
| "epoch": 0.4401294498381877, |
| "grad_norm": 0.15015272796154022, |
| "learning_rate": 1e-05, |
| "loss": 0.4581, |
| "num_tokens": 66175194.0, |
| "step": 68 |
| }, |
| { |
| "epoch": 0.44660194174757284, |
| "grad_norm": 0.14811600744724274, |
| "learning_rate": 1e-05, |
| "loss": 0.4547, |
| "num_tokens": 67146659.0, |
| "step": 69 |
| }, |
| { |
| "epoch": 0.45307443365695793, |
| "grad_norm": 0.15160052478313446, |
| "learning_rate": 1e-05, |
| "loss": 0.4389, |
| "num_tokens": 68115421.0, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.459546925566343, |
| "grad_norm": 0.15062326192855835, |
| "learning_rate": 1e-05, |
| "loss": 0.4372, |
| "num_tokens": 69078962.0, |
| "step": 71 |
| }, |
| { |
| "epoch": 0.46601941747572817, |
| "grad_norm": 0.15071865916252136, |
| "learning_rate": 1e-05, |
| "loss": 0.4611, |
| "num_tokens": 70056552.0, |
| "step": 72 |
| }, |
| { |
| "epoch": 0.47249190938511326, |
| "grad_norm": 0.1435338407754898, |
| "learning_rate": 1e-05, |
| "loss": 0.4549, |
| "num_tokens": 71056405.0, |
| "step": 73 |
| }, |
| { |
| "epoch": 0.47896440129449835, |
| "grad_norm": 0.1523878276348114, |
| "learning_rate": 1e-05, |
| "loss": 0.4652, |
| "num_tokens": 72011963.0, |
| "step": 74 |
| }, |
| { |
| "epoch": 0.4854368932038835, |
| "grad_norm": 0.1481529027223587, |
| "learning_rate": 1e-05, |
| "loss": 0.4524, |
| "num_tokens": 72974088.0, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.4919093851132686, |
| "grad_norm": 0.14620022475719452, |
| "learning_rate": 1e-05, |
| "loss": 0.4472, |
| "num_tokens": 73963066.0, |
| "step": 76 |
| }, |
| { |
| "epoch": 0.49838187702265374, |
| "grad_norm": 0.14328692853450775, |
| "learning_rate": 1e-05, |
| "loss": 0.4397, |
| "num_tokens": 74927507.0, |
| "step": 77 |
| }, |
| { |
| "epoch": 0.5048543689320388, |
| "grad_norm": 0.14814729988574982, |
| "learning_rate": 1e-05, |
| "loss": 0.4445, |
| "num_tokens": 75913302.0, |
| "step": 78 |
| }, |
| { |
| "epoch": 0.511326860841424, |
| "grad_norm": 0.14375941455364227, |
| "learning_rate": 1e-05, |
| "loss": 0.4532, |
| "num_tokens": 76871637.0, |
| "step": 79 |
| }, |
| { |
| "epoch": 0.517799352750809, |
| "grad_norm": 0.15578249096870422, |
| "learning_rate": 1e-05, |
| "loss": 0.4472, |
| "num_tokens": 77844738.0, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.5242718446601942, |
| "grad_norm": 0.14302082359790802, |
| "learning_rate": 1e-05, |
| "loss": 0.4407, |
| "num_tokens": 78789735.0, |
| "step": 81 |
| }, |
| { |
| "epoch": 0.5307443365695793, |
| "grad_norm": 0.16008679568767548, |
| "learning_rate": 1e-05, |
| "loss": 0.4382, |
| "num_tokens": 79740057.0, |
| "step": 82 |
| }, |
| { |
| "epoch": 0.5372168284789643, |
| "grad_norm": 0.14800778031349182, |
| "learning_rate": 1e-05, |
| "loss": 0.4451, |
| "num_tokens": 80691344.0, |
| "step": 83 |
| }, |
| { |
| "epoch": 0.5436893203883495, |
| "grad_norm": 0.14400288462638855, |
| "learning_rate": 1e-05, |
| "loss": 0.4441, |
| "num_tokens": 81695065.0, |
| "step": 84 |
| }, |
| { |
| "epoch": 0.5501618122977346, |
| "grad_norm": 0.13986420631408691, |
| "learning_rate": 1e-05, |
| "loss": 0.4414, |
| "num_tokens": 82670591.0, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.5566343042071198, |
| "grad_norm": 0.15451516211032867, |
| "learning_rate": 1e-05, |
| "loss": 0.4415, |
| "num_tokens": 83619525.0, |
| "step": 86 |
| }, |
| { |
| "epoch": 0.5631067961165048, |
| "grad_norm": 0.14956173300743103, |
| "learning_rate": 1e-05, |
| "loss": 0.441, |
| "num_tokens": 84583829.0, |
| "step": 87 |
| }, |
| { |
| "epoch": 0.56957928802589, |
| "grad_norm": 0.15179790556430817, |
| "learning_rate": 1e-05, |
| "loss": 0.4407, |
| "num_tokens": 85554557.0, |
| "step": 88 |
| }, |
| { |
| "epoch": 0.5760517799352751, |
| "grad_norm": 0.15142634510993958, |
| "learning_rate": 1e-05, |
| "loss": 0.442, |
| "num_tokens": 86534800.0, |
| "step": 89 |
| }, |
| { |
| "epoch": 0.5825242718446602, |
| "grad_norm": 0.16368825733661652, |
| "learning_rate": 1e-05, |
| "loss": 0.4456, |
| "num_tokens": 87479322.0, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.5889967637540453, |
| "grad_norm": 0.16267365217208862, |
| "learning_rate": 1e-05, |
| "loss": 0.4436, |
| "num_tokens": 88440604.0, |
| "step": 91 |
| }, |
| { |
| "epoch": 0.5954692556634305, |
| "grad_norm": 0.1441015601158142, |
| "learning_rate": 1e-05, |
| "loss": 0.4407, |
| "num_tokens": 89440902.0, |
| "step": 92 |
| }, |
| { |
| "epoch": 0.6019417475728155, |
| "grad_norm": 0.15438257157802582, |
| "learning_rate": 1e-05, |
| "loss": 0.4335, |
| "num_tokens": 90420292.0, |
| "step": 93 |
| }, |
| { |
| "epoch": 0.6084142394822006, |
| "grad_norm": 0.15095576643943787, |
| "learning_rate": 1e-05, |
| "loss": 0.4427, |
| "num_tokens": 91365927.0, |
| "step": 94 |
| }, |
| { |
| "epoch": 0.6148867313915858, |
| "grad_norm": 0.15472491085529327, |
| "learning_rate": 1e-05, |
| "loss": 0.4369, |
| "num_tokens": 92339481.0, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.6213592233009708, |
| "grad_norm": 0.15544648468494415, |
| "learning_rate": 1e-05, |
| "loss": 0.4323, |
| "num_tokens": 93277920.0, |
| "step": 96 |
| }, |
| { |
| "epoch": 0.627831715210356, |
| "grad_norm": 0.1473296582698822, |
| "learning_rate": 1e-05, |
| "loss": 0.4314, |
| "num_tokens": 94233058.0, |
| "step": 97 |
| }, |
| { |
| "epoch": 0.6343042071197411, |
| "grad_norm": 0.15555965900421143, |
| "learning_rate": 1e-05, |
| "loss": 0.4443, |
| "num_tokens": 95195865.0, |
| "step": 98 |
| }, |
| { |
| "epoch": 0.6407766990291263, |
| "grad_norm": 0.15503650903701782, |
| "learning_rate": 1e-05, |
| "loss": 0.4269, |
| "num_tokens": 96165533.0, |
| "step": 99 |
| }, |
| { |
| "epoch": 0.6472491909385113, |
| "grad_norm": 0.1431349366903305, |
| "learning_rate": 1e-05, |
| "loss": 0.4291, |
| "num_tokens": 97128908.0, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.6537216828478964, |
| "grad_norm": 0.1569562703371048, |
| "learning_rate": 1e-05, |
| "loss": 0.4301, |
| "num_tokens": 98137812.0, |
| "step": 101 |
| }, |
| { |
| "epoch": 0.6601941747572816, |
| "grad_norm": 0.14877967536449432, |
| "learning_rate": 1e-05, |
| "loss": 0.4264, |
| "num_tokens": 99133445.0, |
| "step": 102 |
| }, |
| { |
| "epoch": 0.6666666666666666, |
| "grad_norm": 0.16731275618076324, |
| "learning_rate": 1e-05, |
| "loss": 0.4193, |
| "num_tokens": 100106034.0, |
| "step": 103 |
| }, |
| { |
| "epoch": 0.6731391585760518, |
| "grad_norm": 0.15022587776184082, |
| "learning_rate": 1e-05, |
| "loss": 0.4466, |
| "num_tokens": 101060830.0, |
| "step": 104 |
| }, |
| { |
| "epoch": 0.6796116504854369, |
| "grad_norm": 0.15176573395729065, |
| "learning_rate": 1e-05, |
| "loss": 0.431, |
| "num_tokens": 102032708.0, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.686084142394822, |
| "grad_norm": 0.1639350950717926, |
| "learning_rate": 1e-05, |
| "loss": 0.4145, |
| "num_tokens": 103020888.0, |
| "step": 106 |
| }, |
| { |
| "epoch": 0.6925566343042071, |
| "grad_norm": 0.1504807472229004, |
| "learning_rate": 1e-05, |
| "loss": 0.4377, |
| "num_tokens": 103986883.0, |
| "step": 107 |
| }, |
| { |
| "epoch": 0.6990291262135923, |
| "grad_norm": 0.16267862915992737, |
| "learning_rate": 1e-05, |
| "loss": 0.4348, |
| "num_tokens": 104978514.0, |
| "step": 108 |
| }, |
| { |
| "epoch": 0.7055016181229773, |
| "grad_norm": 0.14575445652008057, |
| "learning_rate": 1e-05, |
| "loss": 0.423, |
| "num_tokens": 105938542.0, |
| "step": 109 |
| }, |
| { |
| "epoch": 0.7119741100323624, |
| "grad_norm": 0.17577598989009857, |
| "learning_rate": 1e-05, |
| "loss": 0.425, |
| "num_tokens": 106909993.0, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.7184466019417476, |
| "grad_norm": 0.14932656288146973, |
| "learning_rate": 1e-05, |
| "loss": 0.4292, |
| "num_tokens": 107872313.0, |
| "step": 111 |
| }, |
| { |
| "epoch": 0.7249190938511327, |
| "grad_norm": 0.15973471105098724, |
| "learning_rate": 1e-05, |
| "loss": 0.423, |
| "num_tokens": 108876000.0, |
| "step": 112 |
| }, |
| { |
| "epoch": 0.7313915857605178, |
| "grad_norm": 0.16627554595470428, |
| "learning_rate": 1e-05, |
| "loss": 0.4234, |
| "num_tokens": 109857790.0, |
| "step": 113 |
| }, |
| { |
| "epoch": 0.7378640776699029, |
| "grad_norm": 0.1428242325782776, |
| "learning_rate": 1e-05, |
| "loss": 0.4242, |
| "num_tokens": 110826497.0, |
| "step": 114 |
| }, |
| { |
| "epoch": 0.7443365695792881, |
| "grad_norm": 0.15781018137931824, |
| "learning_rate": 1e-05, |
| "loss": 0.4228, |
| "num_tokens": 111775780.0, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.7508090614886731, |
| "grad_norm": 0.15125828981399536, |
| "learning_rate": 1e-05, |
| "loss": 0.4248, |
| "num_tokens": 112755203.0, |
| "step": 116 |
| }, |
| { |
| "epoch": 0.7572815533980582, |
| "grad_norm": 0.16092541813850403, |
| "learning_rate": 1e-05, |
| "loss": 0.4246, |
| "num_tokens": 113720983.0, |
| "step": 117 |
| }, |
| { |
| "epoch": 0.7637540453074434, |
| "grad_norm": 0.14613084495067596, |
| "learning_rate": 1e-05, |
| "loss": 0.4156, |
| "num_tokens": 114690500.0, |
| "step": 118 |
| }, |
| { |
| "epoch": 0.7702265372168284, |
| "grad_norm": 0.16003067791461945, |
| "learning_rate": 1e-05, |
| "loss": 0.4162, |
| "num_tokens": 115663752.0, |
| "step": 119 |
| }, |
| { |
| "epoch": 0.7766990291262136, |
| "grad_norm": 0.15294891595840454, |
| "learning_rate": 1e-05, |
| "loss": 0.4136, |
| "num_tokens": 116656118.0, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.7831715210355987, |
| "grad_norm": 0.14649586379528046, |
| "learning_rate": 1e-05, |
| "loss": 0.4226, |
| "num_tokens": 117647006.0, |
| "step": 121 |
| }, |
| { |
| "epoch": 0.7896440129449838, |
| "grad_norm": 0.15786471962928772, |
| "learning_rate": 1e-05, |
| "loss": 0.4139, |
| "num_tokens": 118637898.0, |
| "step": 122 |
| }, |
| { |
| "epoch": 0.7961165048543689, |
| "grad_norm": 0.15297958254814148, |
| "learning_rate": 1e-05, |
| "loss": 0.4234, |
| "num_tokens": 119611539.0, |
| "step": 123 |
| }, |
| { |
| "epoch": 0.8025889967637541, |
| "grad_norm": 0.15327708423137665, |
| "learning_rate": 1e-05, |
| "loss": 0.4252, |
| "num_tokens": 120576485.0, |
| "step": 124 |
| }, |
| { |
| "epoch": 0.8090614886731392, |
| "grad_norm": 0.14421789348125458, |
| "learning_rate": 1e-05, |
| "loss": 0.4253, |
| "num_tokens": 121567862.0, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.8155339805825242, |
| "grad_norm": 0.15074017643928528, |
| "learning_rate": 1e-05, |
| "loss": 0.4156, |
| "num_tokens": 122530893.0, |
| "step": 126 |
| }, |
| { |
| "epoch": 0.8220064724919094, |
| "grad_norm": 0.15563052892684937, |
| "learning_rate": 1e-05, |
| "loss": 0.4176, |
| "num_tokens": 123480227.0, |
| "step": 127 |
| }, |
| { |
| "epoch": 0.8284789644012945, |
| "grad_norm": 0.13963429629802704, |
| "learning_rate": 1e-05, |
| "loss": 0.4312, |
| "num_tokens": 124462070.0, |
| "step": 128 |
| }, |
| { |
| "epoch": 0.8349514563106796, |
| "grad_norm": 0.14507392048835754, |
| "learning_rate": 1e-05, |
| "loss": 0.4246, |
| "num_tokens": 125440428.0, |
| "step": 129 |
| }, |
| { |
| "epoch": 0.8414239482200647, |
| "grad_norm": 0.14936408400535583, |
| "learning_rate": 1e-05, |
| "loss": 0.4214, |
| "num_tokens": 126416310.0, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.8478964401294499, |
| "grad_norm": 0.14725163578987122, |
| "learning_rate": 1e-05, |
| "loss": 0.4221, |
| "num_tokens": 127406436.0, |
| "step": 131 |
| }, |
| { |
| "epoch": 0.8543689320388349, |
| "grad_norm": 0.1589491367340088, |
| "learning_rate": 1e-05, |
| "loss": 0.4191, |
| "num_tokens": 128372197.0, |
| "step": 132 |
| }, |
| { |
| "epoch": 0.86084142394822, |
| "grad_norm": 0.14656752347946167, |
| "learning_rate": 1e-05, |
| "loss": 0.4174, |
| "num_tokens": 129371264.0, |
| "step": 133 |
| }, |
| { |
| "epoch": 0.8673139158576052, |
| "grad_norm": 0.14391183853149414, |
| "learning_rate": 1e-05, |
| "loss": 0.4026, |
| "num_tokens": 130306080.0, |
| "step": 134 |
| }, |
| { |
| "epoch": 0.8737864077669902, |
| "grad_norm": 0.14788095653057098, |
| "learning_rate": 1e-05, |
| "loss": 0.4227, |
| "num_tokens": 131261937.0, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.8802588996763754, |
| "grad_norm": 0.13957837224006653, |
| "learning_rate": 1e-05, |
| "loss": 0.416, |
| "num_tokens": 132223182.0, |
| "step": 136 |
| }, |
| { |
| "epoch": 0.8867313915857605, |
| "grad_norm": 0.14059896767139435, |
| "learning_rate": 1e-05, |
| "loss": 0.4194, |
| "num_tokens": 133211562.0, |
| "step": 137 |
| }, |
| { |
| "epoch": 0.8932038834951457, |
| "grad_norm": 0.14847581088542938, |
| "learning_rate": 1e-05, |
| "loss": 0.4171, |
| "num_tokens": 134208133.0, |
| "step": 138 |
| }, |
| { |
| "epoch": 0.8996763754045307, |
| "grad_norm": 0.14683924615383148, |
| "learning_rate": 1e-05, |
| "loss": 0.4191, |
| "num_tokens": 135194807.0, |
| "step": 139 |
| }, |
| { |
| "epoch": 0.9061488673139159, |
| "grad_norm": 0.14641155302524567, |
| "learning_rate": 1e-05, |
| "loss": 0.4178, |
| "num_tokens": 136175941.0, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.912621359223301, |
| "grad_norm": 0.1390344202518463, |
| "learning_rate": 1e-05, |
| "loss": 0.4291, |
| "num_tokens": 137117638.0, |
| "step": 141 |
| }, |
| { |
| "epoch": 0.919093851132686, |
| "grad_norm": 0.1498357057571411, |
| "learning_rate": 1e-05, |
| "loss": 0.4077, |
| "num_tokens": 138102412.0, |
| "step": 142 |
| }, |
| { |
| "epoch": 0.9255663430420712, |
| "grad_norm": 0.14186346530914307, |
| "learning_rate": 1e-05, |
| "loss": 0.4099, |
| "num_tokens": 139056304.0, |
| "step": 143 |
| }, |
| { |
| "epoch": 0.9320388349514563, |
| "grad_norm": 0.14950646460056305, |
| "learning_rate": 1e-05, |
| "loss": 0.4154, |
| "num_tokens": 140002988.0, |
| "step": 144 |
| }, |
| { |
| "epoch": 0.9385113268608414, |
| "grad_norm": 0.14963679015636444, |
| "learning_rate": 1e-05, |
| "loss": 0.4184, |
| "num_tokens": 140939323.0, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.9449838187702265, |
| "grad_norm": 0.15120644867420197, |
| "learning_rate": 1e-05, |
| "loss": 0.4126, |
| "num_tokens": 141898246.0, |
| "step": 146 |
| }, |
| { |
| "epoch": 0.9514563106796117, |
| "grad_norm": 0.162687748670578, |
| "learning_rate": 1e-05, |
| "loss": 0.4204, |
| "num_tokens": 142861118.0, |
| "step": 147 |
| }, |
| { |
| "epoch": 0.9579288025889967, |
| "grad_norm": 0.15032649040222168, |
| "learning_rate": 1e-05, |
| "loss": 0.4136, |
| "num_tokens": 143845628.0, |
| "step": 148 |
| }, |
| { |
| "epoch": 0.9644012944983819, |
| "grad_norm": 0.14711233973503113, |
| "learning_rate": 1e-05, |
| "loss": 0.4253, |
| "num_tokens": 144816454.0, |
| "step": 149 |
| }, |
| { |
| "epoch": 0.970873786407767, |
| "grad_norm": 0.1571267545223236, |
| "learning_rate": 1e-05, |
| "loss": 0.4142, |
| "num_tokens": 145791528.0, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.9773462783171522, |
| "grad_norm": 0.15269917249679565, |
| "learning_rate": 1e-05, |
| "loss": 0.4266, |
| "num_tokens": 146776225.0, |
| "step": 151 |
| }, |
| { |
| "epoch": 0.9838187702265372, |
| "grad_norm": 0.14772002398967743, |
| "learning_rate": 1e-05, |
| "loss": 0.4096, |
| "num_tokens": 147716038.0, |
| "step": 152 |
| }, |
| { |
| "epoch": 0.9902912621359223, |
| "grad_norm": 0.16296416521072388, |
| "learning_rate": 1e-05, |
| "loss": 0.407, |
| "num_tokens": 148641036.0, |
| "step": 153 |
| }, |
| { |
| "epoch": 0.9967637540453075, |
| "grad_norm": 0.15487153828144073, |
| "learning_rate": 1e-05, |
| "loss": 0.4103, |
| "num_tokens": 149611850.0, |
| "step": 154 |
| }, |
| { |
| "epoch": 1.0, |
| "grad_norm": 0.15487153828144073, |
| "learning_rate": 1e-05, |
| "loss": 0.4121, |
| "num_tokens": 150082857.0, |
| "step": 155 |
| }, |
| { |
| "epoch": 1.006472491909385, |
| "grad_norm": 0.23388831317424774, |
| "learning_rate": 1e-05, |
| "loss": 0.3944, |
| "num_tokens": 151065207.0, |
| "step": 156 |
| }, |
| { |
| "epoch": 1.0129449838187703, |
| "grad_norm": 0.17887376248836517, |
| "learning_rate": 1e-05, |
| "loss": 0.3865, |
| "num_tokens": 152046682.0, |
| "step": 157 |
| }, |
| { |
| "epoch": 1.0194174757281553, |
| "grad_norm": 0.1608133316040039, |
| "learning_rate": 1e-05, |
| "loss": 0.3896, |
| "num_tokens": 153014543.0, |
| "step": 158 |
| }, |
| { |
| "epoch": 1.0258899676375404, |
| "grad_norm": 0.17408691346645355, |
| "learning_rate": 1e-05, |
| "loss": 0.3962, |
| "num_tokens": 153988434.0, |
| "step": 159 |
| }, |
| { |
| "epoch": 1.0323624595469256, |
| "grad_norm": 0.1740640550851822, |
| "learning_rate": 1e-05, |
| "loss": 0.3925, |
| "num_tokens": 154938356.0, |
| "step": 160 |
| }, |
| { |
| "epoch": 1.0388349514563107, |
| "grad_norm": 0.1563650667667389, |
| "learning_rate": 1e-05, |
| "loss": 0.3924, |
| "num_tokens": 155920665.0, |
| "step": 161 |
| }, |
| { |
| "epoch": 1.0453074433656957, |
| "grad_norm": 0.16022861003875732, |
| "learning_rate": 1e-05, |
| "loss": 0.3946, |
| "num_tokens": 156878844.0, |
| "step": 162 |
| }, |
| { |
| "epoch": 1.051779935275081, |
| "grad_norm": 0.1814146637916565, |
| "learning_rate": 1e-05, |
| "loss": 0.3855, |
| "num_tokens": 157852056.0, |
| "step": 163 |
| }, |
| { |
| "epoch": 1.058252427184466, |
| "grad_norm": 0.1558738648891449, |
| "learning_rate": 1e-05, |
| "loss": 0.3948, |
| "num_tokens": 158847308.0, |
| "step": 164 |
| }, |
| { |
| "epoch": 1.064724919093851, |
| "grad_norm": 0.1744164228439331, |
| "learning_rate": 1e-05, |
| "loss": 0.3968, |
| "num_tokens": 159815676.0, |
| "step": 165 |
| }, |
| { |
| "epoch": 1.0711974110032363, |
| "grad_norm": 0.15834972262382507, |
| "learning_rate": 1e-05, |
| "loss": 0.3949, |
| "num_tokens": 160783247.0, |
| "step": 166 |
| }, |
| { |
| "epoch": 1.0776699029126213, |
| "grad_norm": 0.16717448830604553, |
| "learning_rate": 1e-05, |
| "loss": 0.3905, |
| "num_tokens": 161757978.0, |
| "step": 167 |
| }, |
| { |
| "epoch": 1.0841423948220066, |
| "grad_norm": 0.1640552282333374, |
| "learning_rate": 1e-05, |
| "loss": 0.3853, |
| "num_tokens": 162697927.0, |
| "step": 168 |
| }, |
| { |
| "epoch": 1.0906148867313916, |
| "grad_norm": 0.1878693401813507, |
| "learning_rate": 1e-05, |
| "loss": 0.3935, |
| "num_tokens": 163662092.0, |
| "step": 169 |
| }, |
| { |
| "epoch": 1.0970873786407767, |
| "grad_norm": 0.15845640003681183, |
| "learning_rate": 1e-05, |
| "loss": 0.3836, |
| "num_tokens": 164636005.0, |
| "step": 170 |
| }, |
| { |
| "epoch": 1.103559870550162, |
| "grad_norm": 0.17992043495178223, |
| "learning_rate": 1e-05, |
| "loss": 0.3908, |
| "num_tokens": 165575709.0, |
| "step": 171 |
| }, |
| { |
| "epoch": 1.110032362459547, |
| "grad_norm": 0.15234288573265076, |
| "learning_rate": 1e-05, |
| "loss": 0.3789, |
| "num_tokens": 166581172.0, |
| "step": 172 |
| }, |
| { |
| "epoch": 1.116504854368932, |
| "grad_norm": 0.15983612835407257, |
| "learning_rate": 1e-05, |
| "loss": 0.389, |
| "num_tokens": 167529454.0, |
| "step": 173 |
| }, |
| { |
| "epoch": 1.1229773462783172, |
| "grad_norm": 0.1675143986940384, |
| "learning_rate": 1e-05, |
| "loss": 0.3799, |
| "num_tokens": 168484414.0, |
| "step": 174 |
| }, |
| { |
| "epoch": 1.1294498381877023, |
| "grad_norm": 0.15397170186042786, |
| "learning_rate": 1e-05, |
| "loss": 0.3921, |
| "num_tokens": 169452401.0, |
| "step": 175 |
| }, |
| { |
| "epoch": 1.1359223300970873, |
| "grad_norm": 0.17627382278442383, |
| "learning_rate": 1e-05, |
| "loss": 0.383, |
| "num_tokens": 170407712.0, |
| "step": 176 |
| }, |
| { |
| "epoch": 1.1423948220064726, |
| "grad_norm": 0.18198609352111816, |
| "learning_rate": 1e-05, |
| "loss": 0.3901, |
| "num_tokens": 171418075.0, |
| "step": 177 |
| }, |
| { |
| "epoch": 1.1488673139158576, |
| "grad_norm": 0.1528196781873703, |
| "learning_rate": 1e-05, |
| "loss": 0.3893, |
| "num_tokens": 172409907.0, |
| "step": 178 |
| }, |
| { |
| "epoch": 1.1553398058252426, |
| "grad_norm": 0.17974089086055756, |
| "learning_rate": 1e-05, |
| "loss": 0.3845, |
| "num_tokens": 173368352.0, |
| "step": 179 |
| }, |
| { |
| "epoch": 1.161812297734628, |
| "grad_norm": 0.16560745239257812, |
| "learning_rate": 1e-05, |
| "loss": 0.389, |
| "num_tokens": 174328021.0, |
| "step": 180 |
| }, |
| { |
| "epoch": 1.168284789644013, |
| "grad_norm": 0.16693539917469025, |
| "learning_rate": 1e-05, |
| "loss": 0.3859, |
| "num_tokens": 175328614.0, |
| "step": 181 |
| }, |
| { |
| "epoch": 1.174757281553398, |
| "grad_norm": 0.20071224868297577, |
| "learning_rate": 1e-05, |
| "loss": 0.3881, |
| "num_tokens": 176287442.0, |
| "step": 182 |
| }, |
| { |
| "epoch": 1.1812297734627832, |
| "grad_norm": 0.1772185117006302, |
| "learning_rate": 1e-05, |
| "loss": 0.3924, |
| "num_tokens": 177272590.0, |
| "step": 183 |
| }, |
| { |
| "epoch": 1.1877022653721683, |
| "grad_norm": 0.17591412365436554, |
| "learning_rate": 1e-05, |
| "loss": 0.3937, |
| "num_tokens": 178243495.0, |
| "step": 184 |
| }, |
| { |
| "epoch": 1.1941747572815533, |
| "grad_norm": 0.17770753800868988, |
| "learning_rate": 1e-05, |
| "loss": 0.3895, |
| "num_tokens": 179230835.0, |
| "step": 185 |
| }, |
| { |
| "epoch": 1.2006472491909386, |
| "grad_norm": 0.16321398317813873, |
| "learning_rate": 1e-05, |
| "loss": 0.3842, |
| "num_tokens": 180197716.0, |
| "step": 186 |
| }, |
| { |
| "epoch": 1.2071197411003236, |
| "grad_norm": 0.19365891814231873, |
| "learning_rate": 1e-05, |
| "loss": 0.397, |
| "num_tokens": 181168136.0, |
| "step": 187 |
| }, |
| { |
| "epoch": 1.2135922330097086, |
| "grad_norm": 0.15928582847118378, |
| "learning_rate": 1e-05, |
| "loss": 0.3871, |
| "num_tokens": 182127126.0, |
| "step": 188 |
| }, |
| { |
| "epoch": 1.220064724919094, |
| "grad_norm": 0.1753508299589157, |
| "learning_rate": 1e-05, |
| "loss": 0.3892, |
| "num_tokens": 183085892.0, |
| "step": 189 |
| }, |
| { |
| "epoch": 1.226537216828479, |
| "grad_norm": 0.16824939846992493, |
| "learning_rate": 1e-05, |
| "loss": 0.3868, |
| "num_tokens": 184057299.0, |
| "step": 190 |
| }, |
| { |
| "epoch": 1.233009708737864, |
| "grad_norm": 0.1595918834209442, |
| "learning_rate": 1e-05, |
| "loss": 0.3828, |
| "num_tokens": 184998791.0, |
| "step": 191 |
| }, |
| { |
| "epoch": 1.2394822006472492, |
| "grad_norm": 0.1542261838912964, |
| "learning_rate": 1e-05, |
| "loss": 0.3795, |
| "num_tokens": 185986785.0, |
| "step": 192 |
| }, |
| { |
| "epoch": 1.2459546925566343, |
| "grad_norm": 0.15647530555725098, |
| "learning_rate": 1e-05, |
| "loss": 0.3742, |
| "num_tokens": 186929881.0, |
| "step": 193 |
| }, |
| { |
| "epoch": 1.2524271844660193, |
| "grad_norm": 0.15532921254634857, |
| "learning_rate": 1e-05, |
| "loss": 0.39, |
| "num_tokens": 187902972.0, |
| "step": 194 |
| }, |
| { |
| "epoch": 1.2588996763754046, |
| "grad_norm": 0.15017175674438477, |
| "learning_rate": 1e-05, |
| "loss": 0.375, |
| "num_tokens": 188884007.0, |
| "step": 195 |
| }, |
| { |
| "epoch": 1.2653721682847896, |
| "grad_norm": 0.16391442716121674, |
| "learning_rate": 1e-05, |
| "loss": 0.3774, |
| "num_tokens": 189812757.0, |
| "step": 196 |
| }, |
| { |
| "epoch": 1.2718446601941746, |
| "grad_norm": 0.14583992958068848, |
| "learning_rate": 1e-05, |
| "loss": 0.3799, |
| "num_tokens": 190775296.0, |
| "step": 197 |
| }, |
| { |
| "epoch": 1.27831715210356, |
| "grad_norm": 0.15327832102775574, |
| "learning_rate": 1e-05, |
| "loss": 0.3819, |
| "num_tokens": 191750113.0, |
| "step": 198 |
| }, |
| { |
| "epoch": 1.284789644012945, |
| "grad_norm": 0.152728870511055, |
| "learning_rate": 1e-05, |
| "loss": 0.383, |
| "num_tokens": 192691419.0, |
| "step": 199 |
| }, |
| { |
| "epoch": 1.29126213592233, |
| "grad_norm": 0.1549627035856247, |
| "learning_rate": 1e-05, |
| "loss": 0.3882, |
| "num_tokens": 193680379.0, |
| "step": 200 |
| }, |
| { |
| "epoch": 1.2977346278317152, |
| "grad_norm": 0.15266035497188568, |
| "learning_rate": 1e-05, |
| "loss": 0.3816, |
| "num_tokens": 194655500.0, |
| "step": 201 |
| }, |
| { |
| "epoch": 1.3042071197411003, |
| "grad_norm": 0.1430487483739853, |
| "learning_rate": 1e-05, |
| "loss": 0.3829, |
| "num_tokens": 195598008.0, |
| "step": 202 |
| }, |
| { |
| "epoch": 1.3106796116504853, |
| "grad_norm": 0.1570296436548233, |
| "learning_rate": 1e-05, |
| "loss": 0.3862, |
| "num_tokens": 196583549.0, |
| "step": 203 |
| }, |
| { |
| "epoch": 1.3171521035598706, |
| "grad_norm": 0.14150913059711456, |
| "learning_rate": 1e-05, |
| "loss": 0.3831, |
| "num_tokens": 197555491.0, |
| "step": 204 |
| }, |
| { |
| "epoch": 1.3236245954692556, |
| "grad_norm": 0.15058490633964539, |
| "learning_rate": 1e-05, |
| "loss": 0.3766, |
| "num_tokens": 198552040.0, |
| "step": 205 |
| }, |
| { |
| "epoch": 1.3300970873786409, |
| "grad_norm": 0.14992493391036987, |
| "learning_rate": 1e-05, |
| "loss": 0.3775, |
| "num_tokens": 199517007.0, |
| "step": 206 |
| }, |
| { |
| "epoch": 1.3365695792880259, |
| "grad_norm": 0.14830483496189117, |
| "learning_rate": 1e-05, |
| "loss": 0.3844, |
| "num_tokens": 200490485.0, |
| "step": 207 |
| }, |
| { |
| "epoch": 1.343042071197411, |
| "grad_norm": 0.1546541452407837, |
| "learning_rate": 1e-05, |
| "loss": 0.3899, |
| "num_tokens": 201459121.0, |
| "step": 208 |
| }, |
| { |
| "epoch": 1.3495145631067962, |
| "grad_norm": 0.15092389285564423, |
| "learning_rate": 1e-05, |
| "loss": 0.3794, |
| "num_tokens": 202378391.0, |
| "step": 209 |
| }, |
| { |
| "epoch": 1.3559870550161812, |
| "grad_norm": 0.15256242454051971, |
| "learning_rate": 1e-05, |
| "loss": 0.3868, |
| "num_tokens": 203389991.0, |
| "step": 210 |
| }, |
| { |
| "epoch": 1.3624595469255665, |
| "grad_norm": 0.15065321326255798, |
| "learning_rate": 1e-05, |
| "loss": 0.3854, |
| "num_tokens": 204350028.0, |
| "step": 211 |
| }, |
| { |
| "epoch": 1.3689320388349515, |
| "grad_norm": 0.1523621529340744, |
| "learning_rate": 1e-05, |
| "loss": 0.3859, |
| "num_tokens": 205321725.0, |
| "step": 212 |
| }, |
| { |
| "epoch": 1.3754045307443366, |
| "grad_norm": 0.1598656326532364, |
| "learning_rate": 1e-05, |
| "loss": 0.3858, |
| "num_tokens": 206300400.0, |
| "step": 213 |
| }, |
| { |
| "epoch": 1.3818770226537218, |
| "grad_norm": 0.1452968567609787, |
| "learning_rate": 1e-05, |
| "loss": 0.3812, |
| "num_tokens": 207269349.0, |
| "step": 214 |
| }, |
| { |
| "epoch": 1.3883495145631068, |
| "grad_norm": 0.15008953213691711, |
| "learning_rate": 1e-05, |
| "loss": 0.3783, |
| "num_tokens": 208239995.0, |
| "step": 215 |
| }, |
| { |
| "epoch": 1.3948220064724919, |
| "grad_norm": 0.1555267572402954, |
| "learning_rate": 1e-05, |
| "loss": 0.3866, |
| "num_tokens": 209197405.0, |
| "step": 216 |
| }, |
| { |
| "epoch": 1.4012944983818771, |
| "grad_norm": 0.1534145027399063, |
| "learning_rate": 1e-05, |
| "loss": 0.3788, |
| "num_tokens": 210173069.0, |
| "step": 217 |
| }, |
| { |
| "epoch": 1.4077669902912622, |
| "grad_norm": 0.14885641634464264, |
| "learning_rate": 1e-05, |
| "loss": 0.3764, |
| "num_tokens": 211140476.0, |
| "step": 218 |
| }, |
| { |
| "epoch": 1.4142394822006472, |
| "grad_norm": 0.1480827033519745, |
| "learning_rate": 1e-05, |
| "loss": 0.3773, |
| "num_tokens": 212101096.0, |
| "step": 219 |
| }, |
| { |
| "epoch": 1.4207119741100325, |
| "grad_norm": 0.16137006878852844, |
| "learning_rate": 1e-05, |
| "loss": 0.3844, |
| "num_tokens": 213068720.0, |
| "step": 220 |
| }, |
| { |
| "epoch": 1.4271844660194175, |
| "grad_norm": 0.1478767842054367, |
| "learning_rate": 1e-05, |
| "loss": 0.3827, |
| "num_tokens": 214020099.0, |
| "step": 221 |
| }, |
| { |
| "epoch": 1.4336569579288025, |
| "grad_norm": 0.16222462058067322, |
| "learning_rate": 1e-05, |
| "loss": 0.3761, |
| "num_tokens": 214947806.0, |
| "step": 222 |
| }, |
| { |
| "epoch": 1.4401294498381878, |
| "grad_norm": 0.1649448126554489, |
| "learning_rate": 1e-05, |
| "loss": 0.3794, |
| "num_tokens": 215925598.0, |
| "step": 223 |
| }, |
| { |
| "epoch": 1.4466019417475728, |
| "grad_norm": 0.14599865674972534, |
| "learning_rate": 1e-05, |
| "loss": 0.374, |
| "num_tokens": 216915650.0, |
| "step": 224 |
| }, |
| { |
| "epoch": 1.4530744336569579, |
| "grad_norm": 0.16679567098617554, |
| "learning_rate": 1e-05, |
| "loss": 0.3823, |
| "num_tokens": 217865373.0, |
| "step": 225 |
| }, |
| { |
| "epoch": 1.4595469255663431, |
| "grad_norm": 0.15487605333328247, |
| "learning_rate": 1e-05, |
| "loss": 0.373, |
| "num_tokens": 218823612.0, |
| "step": 226 |
| }, |
| { |
| "epoch": 1.4660194174757282, |
| "grad_norm": 0.1633458137512207, |
| "learning_rate": 1e-05, |
| "loss": 0.3722, |
| "num_tokens": 219799962.0, |
| "step": 227 |
| }, |
| { |
| "epoch": 1.4724919093851132, |
| "grad_norm": 0.15614818036556244, |
| "learning_rate": 1e-05, |
| "loss": 0.3828, |
| "num_tokens": 220771199.0, |
| "step": 228 |
| }, |
| { |
| "epoch": 1.4789644012944985, |
| "grad_norm": 0.17319650948047638, |
| "learning_rate": 1e-05, |
| "loss": 0.3777, |
| "num_tokens": 221736326.0, |
| "step": 229 |
| }, |
| { |
| "epoch": 1.4854368932038835, |
| "grad_norm": 0.16175642609596252, |
| "learning_rate": 1e-05, |
| "loss": 0.377, |
| "num_tokens": 222710609.0, |
| "step": 230 |
| }, |
| { |
| "epoch": 1.4919093851132685, |
| "grad_norm": 0.16791830956935883, |
| "learning_rate": 1e-05, |
| "loss": 0.3739, |
| "num_tokens": 223688658.0, |
| "step": 231 |
| }, |
| { |
| "epoch": 1.4983818770226538, |
| "grad_norm": 0.15006020665168762, |
| "learning_rate": 1e-05, |
| "loss": 0.3789, |
| "num_tokens": 224689265.0, |
| "step": 232 |
| }, |
| { |
| "epoch": 1.5048543689320388, |
| "grad_norm": 0.16425134241580963, |
| "learning_rate": 1e-05, |
| "loss": 0.3769, |
| "num_tokens": 225643475.0, |
| "step": 233 |
| }, |
| { |
| "epoch": 1.5113268608414239, |
| "grad_norm": 0.14565372467041016, |
| "learning_rate": 1e-05, |
| "loss": 0.3807, |
| "num_tokens": 226617652.0, |
| "step": 234 |
| }, |
| { |
| "epoch": 1.5177993527508091, |
| "grad_norm": 0.16489112377166748, |
| "learning_rate": 1e-05, |
| "loss": 0.3755, |
| "num_tokens": 227553267.0, |
| "step": 235 |
| }, |
| { |
| "epoch": 1.5242718446601942, |
| "grad_norm": 0.15542085468769073, |
| "learning_rate": 1e-05, |
| "loss": 0.3778, |
| "num_tokens": 228538404.0, |
| "step": 236 |
| }, |
| { |
| "epoch": 1.5307443365695792, |
| "grad_norm": 0.16758009791374207, |
| "learning_rate": 1e-05, |
| "loss": 0.3741, |
| "num_tokens": 229526832.0, |
| "step": 237 |
| }, |
| { |
| "epoch": 1.5372168284789645, |
| "grad_norm": 0.15440639853477478, |
| "learning_rate": 1e-05, |
| "loss": 0.3791, |
| "num_tokens": 230514566.0, |
| "step": 238 |
| }, |
| { |
| "epoch": 1.5436893203883495, |
| "grad_norm": 0.16300874948501587, |
| "learning_rate": 1e-05, |
| "loss": 0.3759, |
| "num_tokens": 231499488.0, |
| "step": 239 |
| }, |
| { |
| "epoch": 1.5501618122977345, |
| "grad_norm": 0.1678025871515274, |
| "learning_rate": 1e-05, |
| "loss": 0.3868, |
| "num_tokens": 232467260.0, |
| "step": 240 |
| }, |
| { |
| "epoch": 1.5566343042071198, |
| "grad_norm": 0.15477962791919708, |
| "learning_rate": 1e-05, |
| "loss": 0.3786, |
| "num_tokens": 233453448.0, |
| "step": 241 |
| }, |
| { |
| "epoch": 1.5631067961165048, |
| "grad_norm": 0.16532817482948303, |
| "learning_rate": 1e-05, |
| "loss": 0.3729, |
| "num_tokens": 234468336.0, |
| "step": 242 |
| }, |
| { |
| "epoch": 1.5695792880258899, |
| "grad_norm": 0.15214623510837555, |
| "learning_rate": 1e-05, |
| "loss": 0.3826, |
| "num_tokens": 235437157.0, |
| "step": 243 |
| }, |
| { |
| "epoch": 1.5760517799352751, |
| "grad_norm": 0.16525112092494965, |
| "learning_rate": 1e-05, |
| "loss": 0.3836, |
| "num_tokens": 236420720.0, |
| "step": 244 |
| }, |
| { |
| "epoch": 1.5825242718446602, |
| "grad_norm": 0.164701908826828, |
| "learning_rate": 1e-05, |
| "loss": 0.3801, |
| "num_tokens": 237408717.0, |
| "step": 245 |
| }, |
| { |
| "epoch": 1.5889967637540452, |
| "grad_norm": 0.1614416241645813, |
| "learning_rate": 1e-05, |
| "loss": 0.3835, |
| "num_tokens": 238399497.0, |
| "step": 246 |
| }, |
| { |
| "epoch": 1.5954692556634305, |
| "grad_norm": 0.17205291986465454, |
| "learning_rate": 1e-05, |
| "loss": 0.385, |
| "num_tokens": 239341614.0, |
| "step": 247 |
| }, |
| { |
| "epoch": 1.6019417475728155, |
| "grad_norm": 0.172869473695755, |
| "learning_rate": 1e-05, |
| "loss": 0.378, |
| "num_tokens": 240303758.0, |
| "step": 248 |
| }, |
| { |
| "epoch": 1.6084142394822005, |
| "grad_norm": 0.170328289270401, |
| "learning_rate": 1e-05, |
| "loss": 0.378, |
| "num_tokens": 241254594.0, |
| "step": 249 |
| }, |
| { |
| "epoch": 1.6148867313915858, |
| "grad_norm": 0.15210796892642975, |
| "learning_rate": 1e-05, |
| "loss": 0.3825, |
| "num_tokens": 242199383.0, |
| "step": 250 |
| }, |
| { |
| "epoch": 1.6213592233009708, |
| "grad_norm": 0.17345553636550903, |
| "learning_rate": 1e-05, |
| "loss": 0.376, |
| "num_tokens": 243205373.0, |
| "step": 251 |
| }, |
| { |
| "epoch": 1.6278317152103559, |
| "grad_norm": 0.15487349033355713, |
| "learning_rate": 1e-05, |
| "loss": 0.371, |
| "num_tokens": 244177909.0, |
| "step": 252 |
| }, |
| { |
| "epoch": 1.6343042071197411, |
| "grad_norm": 0.1735333651304245, |
| "learning_rate": 1e-05, |
| "loss": 0.3826, |
| "num_tokens": 245165218.0, |
| "step": 253 |
| }, |
| { |
| "epoch": 1.6407766990291264, |
| "grad_norm": 0.1656838059425354, |
| "learning_rate": 1e-05, |
| "loss": 0.3714, |
| "num_tokens": 246106607.0, |
| "step": 254 |
| }, |
| { |
| "epoch": 1.6472491909385112, |
| "grad_norm": 0.16804338991641998, |
| "learning_rate": 1e-05, |
| "loss": 0.3777, |
| "num_tokens": 247086207.0, |
| "step": 255 |
| }, |
| { |
| "epoch": 1.6537216828478964, |
| "grad_norm": 0.15802405774593353, |
| "learning_rate": 1e-05, |
| "loss": 0.3811, |
| "num_tokens": 248068946.0, |
| "step": 256 |
| }, |
| { |
| "epoch": 1.6601941747572817, |
| "grad_norm": 0.16986878216266632, |
| "learning_rate": 1e-05, |
| "loss": 0.372, |
| "num_tokens": 249047854.0, |
| "step": 257 |
| }, |
| { |
| "epoch": 1.6666666666666665, |
| "grad_norm": 0.1500914990901947, |
| "learning_rate": 1e-05, |
| "loss": 0.3844, |
| "num_tokens": 250033293.0, |
| "step": 258 |
| }, |
| { |
| "epoch": 1.6731391585760518, |
| "grad_norm": 0.15746842324733734, |
| "learning_rate": 1e-05, |
| "loss": 0.3799, |
| "num_tokens": 251008614.0, |
| "step": 259 |
| }, |
| { |
| "epoch": 1.679611650485437, |
| "grad_norm": 0.15219233930110931, |
| "learning_rate": 1e-05, |
| "loss": 0.3728, |
| "num_tokens": 251959677.0, |
| "step": 260 |
| }, |
| { |
| "epoch": 1.6860841423948218, |
| "grad_norm": 0.1517256200313568, |
| "learning_rate": 1e-05, |
| "loss": 0.3789, |
| "num_tokens": 252935571.0, |
| "step": 261 |
| }, |
| { |
| "epoch": 1.692556634304207, |
| "grad_norm": 0.15222369134426117, |
| "learning_rate": 1e-05, |
| "loss": 0.3806, |
| "num_tokens": 253879032.0, |
| "step": 262 |
| }, |
| { |
| "epoch": 1.6990291262135924, |
| "grad_norm": 0.15999265015125275, |
| "learning_rate": 1e-05, |
| "loss": 0.3646, |
| "num_tokens": 254826418.0, |
| "step": 263 |
| }, |
| { |
| "epoch": 1.7055016181229772, |
| "grad_norm": 0.160135418176651, |
| "learning_rate": 1e-05, |
| "loss": 0.3817, |
| "num_tokens": 255785864.0, |
| "step": 264 |
| }, |
| { |
| "epoch": 1.7119741100323624, |
| "grad_norm": 0.1449073851108551, |
| "learning_rate": 1e-05, |
| "loss": 0.3722, |
| "num_tokens": 256751813.0, |
| "step": 265 |
| }, |
| { |
| "epoch": 1.7184466019417477, |
| "grad_norm": 0.15163810551166534, |
| "learning_rate": 1e-05, |
| "loss": 0.3802, |
| "num_tokens": 257721834.0, |
| "step": 266 |
| }, |
| { |
| "epoch": 1.7249190938511327, |
| "grad_norm": 0.14865443110466003, |
| "learning_rate": 1e-05, |
| "loss": 0.3796, |
| "num_tokens": 258673121.0, |
| "step": 267 |
| }, |
| { |
| "epoch": 1.7313915857605178, |
| "grad_norm": 0.15070468187332153, |
| "learning_rate": 1e-05, |
| "loss": 0.3842, |
| "num_tokens": 259660088.0, |
| "step": 268 |
| }, |
| { |
| "epoch": 1.737864077669903, |
| "grad_norm": 0.1530027985572815, |
| "learning_rate": 1e-05, |
| "loss": 0.3846, |
| "num_tokens": 260659515.0, |
| "step": 269 |
| }, |
| { |
| "epoch": 1.744336569579288, |
| "grad_norm": 0.17373213171958923, |
| "learning_rate": 1e-05, |
| "loss": 0.3787, |
| "num_tokens": 261673283.0, |
| "step": 270 |
| }, |
| { |
| "epoch": 1.750809061488673, |
| "grad_norm": 0.17764431238174438, |
| "learning_rate": 1e-05, |
| "loss": 0.3714, |
| "num_tokens": 262663793.0, |
| "step": 271 |
| }, |
| { |
| "epoch": 1.7572815533980584, |
| "grad_norm": 0.1590733826160431, |
| "learning_rate": 1e-05, |
| "loss": 0.3766, |
| "num_tokens": 263631573.0, |
| "step": 272 |
| }, |
| { |
| "epoch": 1.7637540453074434, |
| "grad_norm": 0.18542887270450592, |
| "learning_rate": 1e-05, |
| "loss": 0.3827, |
| "num_tokens": 264625690.0, |
| "step": 273 |
| }, |
| { |
| "epoch": 1.7702265372168284, |
| "grad_norm": 0.17420196533203125, |
| "learning_rate": 1e-05, |
| "loss": 0.3663, |
| "num_tokens": 265585386.0, |
| "step": 274 |
| }, |
| { |
| "epoch": 1.7766990291262137, |
| "grad_norm": 0.14788936078548431, |
| "learning_rate": 1e-05, |
| "loss": 0.3868, |
| "num_tokens": 266565331.0, |
| "step": 275 |
| }, |
| { |
| "epoch": 1.7831715210355987, |
| "grad_norm": 0.17412249743938446, |
| "learning_rate": 1e-05, |
| "loss": 0.3754, |
| "num_tokens": 267570388.0, |
| "step": 276 |
| }, |
| { |
| "epoch": 1.7896440129449838, |
| "grad_norm": 0.1655820608139038, |
| "learning_rate": 1e-05, |
| "loss": 0.3748, |
| "num_tokens": 268536852.0, |
| "step": 277 |
| }, |
| { |
| "epoch": 1.796116504854369, |
| "grad_norm": 0.15587899088859558, |
| "learning_rate": 1e-05, |
| "loss": 0.3827, |
| "num_tokens": 269499895.0, |
| "step": 278 |
| }, |
| { |
| "epoch": 1.802588996763754, |
| "grad_norm": 0.1700071096420288, |
| "learning_rate": 1e-05, |
| "loss": 0.3708, |
| "num_tokens": 270449454.0, |
| "step": 279 |
| }, |
| { |
| "epoch": 1.809061488673139, |
| "grad_norm": 0.1482965350151062, |
| "learning_rate": 1e-05, |
| "loss": 0.3701, |
| "num_tokens": 271445748.0, |
| "step": 280 |
| }, |
| { |
| "epoch": 1.8155339805825244, |
| "grad_norm": 0.16300396621227264, |
| "learning_rate": 1e-05, |
| "loss": 0.3764, |
| "num_tokens": 272437678.0, |
| "step": 281 |
| }, |
| { |
| "epoch": 1.8220064724919094, |
| "grad_norm": 0.14801423251628876, |
| "learning_rate": 1e-05, |
| "loss": 0.3806, |
| "num_tokens": 273421632.0, |
| "step": 282 |
| }, |
| { |
| "epoch": 1.8284789644012944, |
| "grad_norm": 0.1659514456987381, |
| "learning_rate": 1e-05, |
| "loss": 0.369, |
| "num_tokens": 274382121.0, |
| "step": 283 |
| }, |
| { |
| "epoch": 1.8349514563106797, |
| "grad_norm": 0.14842261373996735, |
| "learning_rate": 1e-05, |
| "loss": 0.3658, |
| "num_tokens": 275355409.0, |
| "step": 284 |
| }, |
| { |
| "epoch": 1.8414239482200647, |
| "grad_norm": 0.14394618570804596, |
| "learning_rate": 1e-05, |
| "loss": 0.3714, |
| "num_tokens": 276363366.0, |
| "step": 285 |
| }, |
| { |
| "epoch": 1.8478964401294498, |
| "grad_norm": 0.14794841408729553, |
| "learning_rate": 1e-05, |
| "loss": 0.3768, |
| "num_tokens": 277316291.0, |
| "step": 286 |
| }, |
| { |
| "epoch": 1.854368932038835, |
| "grad_norm": 0.15567384660243988, |
| "learning_rate": 1e-05, |
| "loss": 0.3742, |
| "num_tokens": 278277485.0, |
| "step": 287 |
| }, |
| { |
| "epoch": 1.86084142394822, |
| "grad_norm": 0.13822609186172485, |
| "learning_rate": 1e-05, |
| "loss": 0.3689, |
| "num_tokens": 279263638.0, |
| "step": 288 |
| }, |
| { |
| "epoch": 1.867313915857605, |
| "grad_norm": 0.1645592600107193, |
| "learning_rate": 1e-05, |
| "loss": 0.3685, |
| "num_tokens": 280229543.0, |
| "step": 289 |
| }, |
| { |
| "epoch": 1.8737864077669903, |
| "grad_norm": 0.1565285474061966, |
| "learning_rate": 1e-05, |
| "loss": 0.3681, |
| "num_tokens": 281207364.0, |
| "step": 290 |
| }, |
| { |
| "epoch": 1.8802588996763754, |
| "grad_norm": 0.15756022930145264, |
| "learning_rate": 1e-05, |
| "loss": 0.3668, |
| "num_tokens": 282156358.0, |
| "step": 291 |
| }, |
| { |
| "epoch": 1.8867313915857604, |
| "grad_norm": 0.15506701171398163, |
| "learning_rate": 1e-05, |
| "loss": 0.3697, |
| "num_tokens": 283119435.0, |
| "step": 292 |
| }, |
| { |
| "epoch": 1.8932038834951457, |
| "grad_norm": 0.15256839990615845, |
| "learning_rate": 1e-05, |
| "loss": 0.3793, |
| "num_tokens": 284084676.0, |
| "step": 293 |
| }, |
| { |
| "epoch": 1.8996763754045307, |
| "grad_norm": 0.16919973492622375, |
| "learning_rate": 1e-05, |
| "loss": 0.3895, |
| "num_tokens": 285051682.0, |
| "step": 294 |
| }, |
| { |
| "epoch": 1.9061488673139158, |
| "grad_norm": 0.1467684656381607, |
| "learning_rate": 1e-05, |
| "loss": 0.3741, |
| "num_tokens": 286037634.0, |
| "step": 295 |
| }, |
| { |
| "epoch": 1.912621359223301, |
| "grad_norm": 0.1547224074602127, |
| "learning_rate": 1e-05, |
| "loss": 0.3677, |
| "num_tokens": 287028258.0, |
| "step": 296 |
| }, |
| { |
| "epoch": 1.919093851132686, |
| "grad_norm": 0.14955021440982819, |
| "learning_rate": 1e-05, |
| "loss": 0.371, |
| "num_tokens": 287996979.0, |
| "step": 297 |
| }, |
| { |
| "epoch": 1.925566343042071, |
| "grad_norm": 0.14600345492362976, |
| "learning_rate": 1e-05, |
| "loss": 0.3704, |
| "num_tokens": 288984893.0, |
| "step": 298 |
| }, |
| { |
| "epoch": 1.9320388349514563, |
| "grad_norm": 0.15038008987903595, |
| "learning_rate": 1e-05, |
| "loss": 0.3728, |
| "num_tokens": 289948988.0, |
| "step": 299 |
| }, |
| { |
| "epoch": 1.9385113268608414, |
| "grad_norm": 0.1539427489042282, |
| "learning_rate": 1e-05, |
| "loss": 0.3786, |
| "num_tokens": 290947258.0, |
| "step": 300 |
| }, |
| { |
| "epoch": 1.9449838187702264, |
| "grad_norm": 0.1640380322933197, |
| "learning_rate": 1e-05, |
| "loss": 0.3735, |
| "num_tokens": 291896825.0, |
| "step": 301 |
| }, |
| { |
| "epoch": 1.9514563106796117, |
| "grad_norm": 0.13670173287391663, |
| "learning_rate": 1e-05, |
| "loss": 0.3639, |
| "num_tokens": 292874918.0, |
| "step": 302 |
| }, |
| { |
| "epoch": 1.9579288025889967, |
| "grad_norm": 0.1619482934474945, |
| "learning_rate": 1e-05, |
| "loss": 0.3761, |
| "num_tokens": 293858095.0, |
| "step": 303 |
| }, |
| { |
| "epoch": 1.9644012944983817, |
| "grad_norm": 0.14659424126148224, |
| "learning_rate": 1e-05, |
| "loss": 0.3793, |
| "num_tokens": 294831363.0, |
| "step": 304 |
| }, |
| { |
| "epoch": 1.970873786407767, |
| "grad_norm": 0.1463099718093872, |
| "learning_rate": 1e-05, |
| "loss": 0.3667, |
| "num_tokens": 295815029.0, |
| "step": 305 |
| }, |
| { |
| "epoch": 1.9773462783171523, |
| "grad_norm": 0.15758393704891205, |
| "learning_rate": 1e-05, |
| "loss": 0.381, |
| "num_tokens": 296783225.0, |
| "step": 306 |
| }, |
| { |
| "epoch": 1.983818770226537, |
| "grad_norm": 0.14858050644397736, |
| "learning_rate": 1e-05, |
| "loss": 0.37, |
| "num_tokens": 297740638.0, |
| "step": 307 |
| }, |
| { |
| "epoch": 1.9902912621359223, |
| "grad_norm": 0.1502712517976761, |
| "learning_rate": 1e-05, |
| "loss": 0.382, |
| "num_tokens": 298713508.0, |
| "step": 308 |
| }, |
| { |
| "epoch": 1.9902912621359223, |
| "step": 308, |
| "total_flos": 1.2344471118126514e+19, |
| "train_loss": 0.44806588683035464, |
| "train_runtime": 9308.2443, |
| "train_samples_per_second": 14.854, |
| "train_steps_per_second": 0.033 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 308, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 2, |
| "save_steps": 16, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.2344471118126514e+19, |
| "train_batch_size": 16, |
| "trial_name": null, |
| "trial_params": null |
| } |