{
  "best_global_step": 2144,
  "best_metric": 0.6239609122276306,
  "best_model_checkpoint": "saves/prefix-tuning/llama-3-8b-instruct/train_conala_1756729619/checkpoint-2144",
  "epoch": 10.0,
  "eval_steps": 536,
  "global_step": 10710,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.004668534080298786,
      "grad_norm": 84.6258544921875,
      "learning_rate": 1.8674136321195148e-07,
      "loss": 8.5449,
      "num_input_tokens_seen": 576,
      "step": 5
    },
    {
      "epoch": 0.009337068160597572,
      "grad_norm": 76.01036834716797,
      "learning_rate": 4.2016806722689076e-07,
      "loss": 8.3941,
      "num_input_tokens_seen": 1168,
      "step": 10
    },
    {
      "epoch": 0.014005602240896359,
      "grad_norm": 99.43428802490234,
      "learning_rate": 6.535947712418301e-07,
      "loss": 8.6591,
      "num_input_tokens_seen": 1824,
      "step": 15
    },
    {
      "epoch": 0.018674136321195144,
      "grad_norm": 103.66761016845703,
      "learning_rate": 8.870214752567694e-07,
      "loss": 8.3524,
      "num_input_tokens_seen": 2384,
      "step": 20
    },
    {
      "epoch": 0.02334267040149393,
      "grad_norm": 55.7948112487793,
      "learning_rate": 1.1204481792717088e-06,
      "loss": 8.3068,
      "num_input_tokens_seen": 3040,
      "step": 25
    },
    {
      "epoch": 0.028011204481792718,
      "grad_norm": 88.3673095703125,
      "learning_rate": 1.3538748832866481e-06,
      "loss": 7.901,
      "num_input_tokens_seen": 3776,
      "step": 30
    },
    {
      "epoch": 0.032679738562091505,
      "grad_norm": 95.092041015625,
      "learning_rate": 1.5873015873015873e-06,
      "loss": 7.813,
      "num_input_tokens_seen": 4336,
      "step": 35
    },
    {
      "epoch": 0.03734827264239029,
      "grad_norm": 76.32391357421875,
      "learning_rate": 1.820728291316527e-06,
      "loss": 6.4782,
      "num_input_tokens_seen": 4944,
      "step": 40
    },
    {
      "epoch": 0.04201680672268908,
      "grad_norm": 52.776458740234375,
      "learning_rate": 2.054154995331466e-06,
      "loss": 6.6237,
      "num_input_tokens_seen": 5536,
      "step": 45
    },
    {
      "epoch": 0.04668534080298786,
      "grad_norm": 45.56707000732422,
      "learning_rate": 2.2875816993464053e-06,
      "loss": 5.8744,
      "num_input_tokens_seen": 6208,
      "step": 50
    },
    {
      "epoch": 0.051353874883286646,
      "grad_norm": 53.54768753051758,
      "learning_rate": 2.521008403361345e-06,
      "loss": 5.7237,
      "num_input_tokens_seen": 6848,
      "step": 55
    },
    {
      "epoch": 0.056022408963585436,
      "grad_norm": 46.92519760131836,
      "learning_rate": 2.754435107376284e-06,
      "loss": 4.7982,
      "num_input_tokens_seen": 7584,
      "step": 60
    },
    {
      "epoch": 0.06069094304388422,
      "grad_norm": 60.48017883300781,
      "learning_rate": 2.9878618113912236e-06,
      "loss": 5.0754,
      "num_input_tokens_seen": 8176,
      "step": 65
    },
    {
      "epoch": 0.06535947712418301,
      "grad_norm": 47.011898040771484,
      "learning_rate": 3.2212885154061624e-06,
      "loss": 5.0185,
      "num_input_tokens_seen": 8720,
      "step": 70
    },
    {
      "epoch": 0.0700280112044818,
      "grad_norm": 75.90699768066406,
      "learning_rate": 3.454715219421102e-06,
      "loss": 5.0108,
      "num_input_tokens_seen": 9312,
      "step": 75
    },
    {
      "epoch": 0.07469654528478058,
      "grad_norm": 47.10323715209961,
      "learning_rate": 3.688141923436041e-06,
      "loss": 5.2044,
      "num_input_tokens_seen": 9888,
      "step": 80
    },
    {
      "epoch": 0.07936507936507936,
      "grad_norm": 43.27751541137695,
      "learning_rate": 3.92156862745098e-06,
      "loss": 4.6655,
      "num_input_tokens_seen": 10480,
      "step": 85
    },
    {
      "epoch": 0.08403361344537816,
      "grad_norm": 49.48586654663086,
      "learning_rate": 4.15499533146592e-06,
      "loss": 4.8074,
      "num_input_tokens_seen": 11088,
      "step": 90
    },
    {
      "epoch": 0.08870214752567694,
      "grad_norm": 42.18975830078125,
      "learning_rate": 4.388422035480859e-06,
      "loss": 3.6381,
      "num_input_tokens_seen": 11872,
      "step": 95
    },
    {
      "epoch": 0.09337068160597572,
      "grad_norm": 43.712135314941406,
      "learning_rate": 4.621848739495799e-06,
      "loss": 4.0669,
      "num_input_tokens_seen": 12608,
      "step": 100
    },
    {
      "epoch": 0.09803921568627451,
      "grad_norm": 47.92484664916992,
      "learning_rate": 4.855275443510738e-06,
      "loss": 4.4186,
      "num_input_tokens_seen": 13280,
      "step": 105
    },
    {
      "epoch": 0.10270774976657329,
      "grad_norm": 42.59040069580078,
      "learning_rate": 5.0887021475256775e-06,
      "loss": 3.6303,
      "num_input_tokens_seen": 13952,
      "step": 110
    },
    {
      "epoch": 0.10737628384687208,
      "grad_norm": 41.3209342956543,
      "learning_rate": 5.322128851540616e-06,
      "loss": 3.4098,
      "num_input_tokens_seen": 14592,
      "step": 115
    },
    {
      "epoch": 0.11204481792717087,
      "grad_norm": 66.64546966552734,
      "learning_rate": 5.555555555555556e-06,
      "loss": 3.858,
      "num_input_tokens_seen": 15248,
      "step": 120
    },
    {
      "epoch": 0.11671335200746966,
      "grad_norm": 78.6478042602539,
      "learning_rate": 5.788982259570495e-06,
      "loss": 3.4176,
      "num_input_tokens_seen": 15904,
      "step": 125
    },
    {
      "epoch": 0.12138188608776844,
      "grad_norm": 44.221187591552734,
      "learning_rate": 6.022408963585434e-06,
      "loss": 3.6577,
      "num_input_tokens_seen": 16576,
      "step": 130
    },
    {
      "epoch": 0.12605042016806722,
      "grad_norm": 42.961875915527344,
      "learning_rate": 6.255835667600374e-06,
      "loss": 3.1459,
      "num_input_tokens_seen": 17216,
      "step": 135
    },
    {
      "epoch": 0.13071895424836602,
      "grad_norm": 42.52253341674805,
      "learning_rate": 6.489262371615313e-06,
      "loss": 3.5311,
      "num_input_tokens_seen": 17888,
      "step": 140
    },
    {
      "epoch": 0.1353874883286648,
      "grad_norm": 34.132728576660156,
      "learning_rate": 6.722689075630252e-06,
      "loss": 3.344,
      "num_input_tokens_seen": 18544,
      "step": 145
    },
    {
      "epoch": 0.1400560224089636,
      "grad_norm": 45.744293212890625,
      "learning_rate": 6.956115779645192e-06,
      "loss": 3.1912,
      "num_input_tokens_seen": 19184,
      "step": 150
    },
    {
      "epoch": 0.14472455648926238,
      "grad_norm": 55.98895263671875,
      "learning_rate": 7.1895424836601305e-06,
      "loss": 2.7634,
      "num_input_tokens_seen": 19760,
      "step": 155
    },
    {
      "epoch": 0.14939309056956115,
      "grad_norm": 48.209251403808594,
      "learning_rate": 7.42296918767507e-06,
      "loss": 2.8525,
      "num_input_tokens_seen": 20432,
      "step": 160
    },
    {
      "epoch": 0.15406162464985995,
      "grad_norm": 55.19756317138672,
      "learning_rate": 7.65639589169001e-06,
      "loss": 2.4636,
      "num_input_tokens_seen": 21120,
      "step": 165
    },
    {
      "epoch": 0.15873015873015872,
      "grad_norm": 39.187644958496094,
      "learning_rate": 7.889822595704948e-06,
      "loss": 2.7641,
      "num_input_tokens_seen": 21744,
      "step": 170
    },
    {
      "epoch": 0.16339869281045752,
      "grad_norm": 37.96087646484375,
      "learning_rate": 8.123249299719889e-06,
      "loss": 2.7158,
      "num_input_tokens_seen": 22432,
      "step": 175
    },
    {
      "epoch": 0.16806722689075632,
      "grad_norm": 41.540382385253906,
      "learning_rate": 8.356676003734828e-06,
      "loss": 2.1432,
      "num_input_tokens_seen": 23216,
      "step": 180
    },
    {
      "epoch": 0.17273576097105509,
      "grad_norm": 52.784114837646484,
      "learning_rate": 8.590102707749766e-06,
      "loss": 2.309,
      "num_input_tokens_seen": 23856,
      "step": 185
    },
    {
      "epoch": 0.17740429505135388,
      "grad_norm": 43.1025390625,
      "learning_rate": 8.823529411764707e-06,
      "loss": 2.2712,
      "num_input_tokens_seen": 24480,
      "step": 190
    },
    {
      "epoch": 0.18207282913165265,
      "grad_norm": 23.09410285949707,
      "learning_rate": 9.056956115779646e-06,
      "loss": 2.2288,
      "num_input_tokens_seen": 25184,
      "step": 195
    },
    {
      "epoch": 0.18674136321195145,
      "grad_norm": 43.10409164428711,
      "learning_rate": 9.290382819794586e-06,
      "loss": 2.5402,
      "num_input_tokens_seen": 25872,
      "step": 200
    },
    {
      "epoch": 0.19140989729225025,
      "grad_norm": 37.84174346923828,
      "learning_rate": 9.523809523809523e-06,
      "loss": 2.5288,
      "num_input_tokens_seen": 26544,
      "step": 205
    },
    {
      "epoch": 0.19607843137254902,
      "grad_norm": 50.47822570800781,
      "learning_rate": 9.757236227824464e-06,
      "loss": 2.9575,
      "num_input_tokens_seen": 27152,
      "step": 210
    },
    {
      "epoch": 0.20074696545284781,
      "grad_norm": 42.75927734375,
      "learning_rate": 9.990662931839404e-06,
      "loss": 2.4739,
      "num_input_tokens_seen": 27744,
      "step": 215
    },
    {
      "epoch": 0.20541549953314658,
      "grad_norm": 41.93454360961914,
      "learning_rate": 1.0224089635854343e-05,
      "loss": 2.2127,
      "num_input_tokens_seen": 28368,
      "step": 220
    },
    {
      "epoch": 0.21008403361344538,
      "grad_norm": 52.05291748046875,
      "learning_rate": 1.0457516339869281e-05,
      "loss": 2.1488,
      "num_input_tokens_seen": 29008,
      "step": 225
    },
    {
      "epoch": 0.21475256769374415,
      "grad_norm": 48.70277786254883,
      "learning_rate": 1.069094304388422e-05,
      "loss": 2.0124,
      "num_input_tokens_seen": 29648,
      "step": 230
    },
    {
      "epoch": 0.21942110177404295,
      "grad_norm": 58.86783981323242,
      "learning_rate": 1.092436974789916e-05,
      "loss": 2.3292,
      "num_input_tokens_seen": 30288,
      "step": 235
    },
    {
      "epoch": 0.22408963585434175,
      "grad_norm": 73.74400329589844,
      "learning_rate": 1.11577964519141e-05,
      "loss": 2.465,
      "num_input_tokens_seen": 30960,
      "step": 240
    },
    {
      "epoch": 0.22875816993464052,
      "grad_norm": 44.3712043762207,
      "learning_rate": 1.1391223155929038e-05,
      "loss": 2.0996,
      "num_input_tokens_seen": 31648,
      "step": 245
    },
    {
      "epoch": 0.2334267040149393,
      "grad_norm": 38.637699127197266,
      "learning_rate": 1.1624649859943979e-05,
      "loss": 2.0566,
      "num_input_tokens_seen": 32304,
      "step": 250
    },
    {
      "epoch": 0.23809523809523808,
      "grad_norm": 48.099578857421875,
      "learning_rate": 1.1858076563958917e-05,
      "loss": 2.099,
      "num_input_tokens_seen": 32880,
      "step": 255
    },
    {
      "epoch": 0.24276377217553688,
      "grad_norm": 32.05073547363281,
      "learning_rate": 1.2091503267973856e-05,
      "loss": 1.7378,
      "num_input_tokens_seen": 33504,
      "step": 260
    },
    {
      "epoch": 0.24743230625583568,
      "grad_norm": 38.621734619140625,
      "learning_rate": 1.2324929971988797e-05,
      "loss": 2.0952,
      "num_input_tokens_seen": 34128,
      "step": 265
    },
    {
      "epoch": 0.25210084033613445,
      "grad_norm": 31.620946884155273,
      "learning_rate": 1.2558356676003735e-05,
      "loss": 1.3152,
      "num_input_tokens_seen": 34864,
      "step": 270
    },
    {
      "epoch": 0.2567693744164332,
      "grad_norm": 36.46520233154297,
      "learning_rate": 1.2791783380018674e-05,
      "loss": 1.7895,
      "num_input_tokens_seen": 35472,
      "step": 275
    },
    {
      "epoch": 0.26143790849673204,
      "grad_norm": 60.55278396606445,
      "learning_rate": 1.3025210084033614e-05,
      "loss": 1.9903,
      "num_input_tokens_seen": 36144,
      "step": 280
    },
    {
      "epoch": 0.2661064425770308,
      "grad_norm": 26.2589111328125,
      "learning_rate": 1.3258636788048553e-05,
      "loss": 1.2714,
      "num_input_tokens_seen": 36768,
      "step": 285
    },
    {
      "epoch": 0.2707749766573296,
      "grad_norm": 47.84441375732422,
      "learning_rate": 1.3492063492063492e-05,
      "loss": 1.3887,
      "num_input_tokens_seen": 37424,
      "step": 290
    },
    {
      "epoch": 0.2754435107376284,
      "grad_norm": 36.61695098876953,
      "learning_rate": 1.3725490196078432e-05,
      "loss": 1.4811,
      "num_input_tokens_seen": 38096,
      "step": 295
    },
    {
      "epoch": 0.2801120448179272,
      "grad_norm": 38.53045654296875,
      "learning_rate": 1.3958916900093371e-05,
      "loss": 1.9248,
      "num_input_tokens_seen": 38736,
      "step": 300
    },
    {
      "epoch": 0.28478057889822594,
      "grad_norm": 44.79354476928711,
      "learning_rate": 1.419234360410831e-05,
      "loss": 1.134,
      "num_input_tokens_seen": 39424,
      "step": 305
    },
    {
      "epoch": 0.28944911297852477,
      "grad_norm": 46.52348709106445,
      "learning_rate": 1.4425770308123249e-05,
      "loss": 1.3332,
      "num_input_tokens_seen": 40160,
      "step": 310
    },
    {
      "epoch": 0.29411764705882354,
      "grad_norm": 29.020601272583008,
      "learning_rate": 1.4659197012138189e-05,
      "loss": 1.307,
      "num_input_tokens_seen": 40768,
      "step": 315
    },
    {
      "epoch": 0.2987861811391223,
      "grad_norm": 47.507301330566406,
      "learning_rate": 1.4892623716153128e-05,
      "loss": 1.7801,
      "num_input_tokens_seen": 41376,
      "step": 320
    },
    {
      "epoch": 0.3034547152194211,
      "grad_norm": 30.17751121520996,
      "learning_rate": 1.5126050420168067e-05,
      "loss": 1.1193,
      "num_input_tokens_seen": 42144,
      "step": 325
    },
    {
      "epoch": 0.3081232492997199,
      "grad_norm": 24.388837814331055,
      "learning_rate": 1.5359477124183007e-05,
      "loss": 1.0519,
      "num_input_tokens_seen": 42880,
      "step": 330
    },
    {
      "epoch": 0.3127917833800187,
      "grad_norm": 26.254653930664062,
      "learning_rate": 1.5592903828197946e-05,
      "loss": 1.2371,
      "num_input_tokens_seen": 43472,
      "step": 335
    },
    {
      "epoch": 0.31746031746031744,
      "grad_norm": 32.2711296081543,
      "learning_rate": 1.5826330532212885e-05,
      "loss": 1.2114,
      "num_input_tokens_seen": 44064,
      "step": 340
    },
    {
      "epoch": 0.32212885154061627,
      "grad_norm": 23.010696411132812,
      "learning_rate": 1.6059757236227827e-05,
      "loss": 1.5254,
      "num_input_tokens_seen": 44688,
      "step": 345
    },
    {
      "epoch": 0.32679738562091504,
      "grad_norm": 14.727700233459473,
      "learning_rate": 1.6293183940242765e-05,
      "loss": 1.0906,
      "num_input_tokens_seen": 45296,
      "step": 350
    },
    {
      "epoch": 0.3314659197012138,
      "grad_norm": 25.24241065979004,
      "learning_rate": 1.6526610644257704e-05,
      "loss": 1.4038,
      "num_input_tokens_seen": 45920,
      "step": 355
    },
    {
      "epoch": 0.33613445378151263,
      "grad_norm": 28.438600540161133,
      "learning_rate": 1.6760037348272643e-05,
      "loss": 1.1871,
      "num_input_tokens_seen": 46496,
      "step": 360
    },
    {
      "epoch": 0.3408029878618114,
      "grad_norm": 33.14903259277344,
      "learning_rate": 1.6993464052287582e-05,
      "loss": 1.6374,
      "num_input_tokens_seen": 47088,
      "step": 365
    },
    {
      "epoch": 0.34547152194211017,
      "grad_norm": 23.363435745239258,
      "learning_rate": 1.722689075630252e-05,
      "loss": 1.5303,
      "num_input_tokens_seen": 47680,
      "step": 370
    },
    {
      "epoch": 0.35014005602240894,
      "grad_norm": 19.318904876708984,
      "learning_rate": 1.746031746031746e-05,
      "loss": 0.7557,
      "num_input_tokens_seen": 48320,
      "step": 375
    },
    {
      "epoch": 0.35480859010270777,
      "grad_norm": 7.826172828674316,
      "learning_rate": 1.76937441643324e-05,
      "loss": 0.8215,
      "num_input_tokens_seen": 48976,
      "step": 380
    },
    {
      "epoch": 0.35947712418300654,
      "grad_norm": 17.366634368896484,
      "learning_rate": 1.792717086834734e-05,
      "loss": 1.5873,
      "num_input_tokens_seen": 49536,
      "step": 385
    },
    {
      "epoch": 0.3641456582633053,
      "grad_norm": 21.24698829650879,
      "learning_rate": 1.816059757236228e-05,
      "loss": 1.1401,
      "num_input_tokens_seen": 50160,
      "step": 390
    },
    {
      "epoch": 0.36881419234360413,
      "grad_norm": 12.986654281616211,
      "learning_rate": 1.839402427637722e-05,
      "loss": 0.6504,
      "num_input_tokens_seen": 50864,
      "step": 395
    },
    {
      "epoch": 0.3734827264239029,
      "grad_norm": 24.244192123413086,
      "learning_rate": 1.862745098039216e-05,
      "loss": 1.4523,
      "num_input_tokens_seen": 51552,
      "step": 400
    },
    {
      "epoch": 0.37815126050420167,
      "grad_norm": 12.188508987426758,
      "learning_rate": 1.8860877684407095e-05,
      "loss": 0.8425,
      "num_input_tokens_seen": 52208,
      "step": 405
    },
    {
      "epoch": 0.3828197945845005,
      "grad_norm": 37.367069244384766,
      "learning_rate": 1.9094304388422034e-05,
      "loss": 1.1623,
      "num_input_tokens_seen": 52864,
      "step": 410
    },
    {
      "epoch": 0.38748832866479926,
      "grad_norm": 39.64646530151367,
      "learning_rate": 1.9327731092436976e-05,
      "loss": 1.0445,
      "num_input_tokens_seen": 53456,
      "step": 415
    },
    {
      "epoch": 0.39215686274509803,
      "grad_norm": 23.681903839111328,
      "learning_rate": 1.9561157796451915e-05,
      "loss": 1.414,
      "num_input_tokens_seen": 54080,
      "step": 420
    },
    {
      "epoch": 0.3968253968253968,
      "grad_norm": 15.80068302154541,
      "learning_rate": 1.9794584500466854e-05,
      "loss": 0.842,
      "num_input_tokens_seen": 54720,
      "step": 425
    },
    {
      "epoch": 0.40149393090569563,
      "grad_norm": 23.32377815246582,
      "learning_rate": 2.0028011204481796e-05,
      "loss": 0.7494,
      "num_input_tokens_seen": 55328,
      "step": 430
    },
    {
      "epoch": 0.4061624649859944,
      "grad_norm": 6.138006687164307,
      "learning_rate": 2.0261437908496734e-05,
      "loss": 0.9735,
      "num_input_tokens_seen": 55936,
      "step": 435
    },
    {
      "epoch": 0.41083099906629317,
      "grad_norm": 124.19203186035156,
      "learning_rate": 2.0494864612511673e-05,
      "loss": 0.9148,
      "num_input_tokens_seen": 56608,
      "step": 440
    },
    {
      "epoch": 0.415499533146592,
      "grad_norm": 16.503829956054688,
      "learning_rate": 2.0728291316526612e-05,
      "loss": 0.9937,
      "num_input_tokens_seen": 57264,
      "step": 445
    },
    {
      "epoch": 0.42016806722689076,
      "grad_norm": 64.5467758178711,
      "learning_rate": 2.096171802054155e-05,
      "loss": 0.6676,
      "num_input_tokens_seen": 57856,
      "step": 450
    },
    {
      "epoch": 0.42483660130718953,
      "grad_norm": 9.891252517700195,
      "learning_rate": 2.119514472455649e-05,
      "loss": 0.5961,
      "num_input_tokens_seen": 58448,
      "step": 455
    },
    {
      "epoch": 0.4295051353874883,
      "grad_norm": 45.48982620239258,
      "learning_rate": 2.1428571428571428e-05,
      "loss": 1.1232,
      "num_input_tokens_seen": 59072,
      "step": 460
    },
    {
      "epoch": 0.4341736694677871,
      "grad_norm": 11.444414138793945,
      "learning_rate": 2.166199813258637e-05,
      "loss": 0.5722,
      "num_input_tokens_seen": 59744,
      "step": 465
    },
    {
      "epoch": 0.4388422035480859,
      "grad_norm": 19.90337371826172,
      "learning_rate": 2.189542483660131e-05,
      "loss": 1.4224,
      "num_input_tokens_seen": 60336,
      "step": 470
    },
    {
      "epoch": 0.44351073762838467,
      "grad_norm": 14.948549270629883,
      "learning_rate": 2.2128851540616248e-05,
      "loss": 0.5425,
      "num_input_tokens_seen": 60896,
      "step": 475
    },
    {
      "epoch": 0.4481792717086835,
      "grad_norm": 6.758205890655518,
      "learning_rate": 2.2362278244631187e-05,
      "loss": 1.067,
      "num_input_tokens_seen": 61600,
      "step": 480
    },
    {
      "epoch": 0.45284780578898226,
      "grad_norm": 12.607068061828613,
      "learning_rate": 2.2595704948646125e-05,
      "loss": 1.2737,
      "num_input_tokens_seen": 62272,
      "step": 485
    },
    {
      "epoch": 0.45751633986928103,
      "grad_norm": 15.035085678100586,
      "learning_rate": 2.2829131652661064e-05,
      "loss": 0.8138,
      "num_input_tokens_seen": 62784,
      "step": 490
    },
    {
      "epoch": 0.46218487394957986,
      "grad_norm": 14.877044677734375,
      "learning_rate": 2.3062558356676006e-05,
      "loss": 1.0438,
      "num_input_tokens_seen": 63472,
      "step": 495
    },
    {
      "epoch": 0.4668534080298786,
      "grad_norm": 14.44520092010498,
      "learning_rate": 2.3295985060690945e-05,
      "loss": 1.2706,
      "num_input_tokens_seen": 64192,
      "step": 500
    },
    {
      "epoch": 0.4715219421101774,
      "grad_norm": 11.816399574279785,
      "learning_rate": 2.3529411764705884e-05,
      "loss": 1.0394,
      "num_input_tokens_seen": 64880,
      "step": 505
    },
    {
      "epoch": 0.47619047619047616,
      "grad_norm": 13.24144172668457,
      "learning_rate": 2.3762838468720822e-05,
      "loss": 0.6415,
      "num_input_tokens_seen": 65536,
      "step": 510
    },
    {
      "epoch": 0.480859010270775,
      "grad_norm": 13.559804916381836,
      "learning_rate": 2.3996265172735765e-05,
      "loss": 1.3095,
      "num_input_tokens_seen": 66176,
      "step": 515
    },
    {
      "epoch": 0.48552754435107376,
      "grad_norm": 7.28280782699585,
      "learning_rate": 2.42296918767507e-05,
      "loss": 1.1768,
      "num_input_tokens_seen": 66768,
      "step": 520
    },
    {
      "epoch": 0.49019607843137253,
      "grad_norm": 5.257969856262207,
      "learning_rate": 2.446311858076564e-05,
      "loss": 0.4696,
      "num_input_tokens_seen": 67520,
      "step": 525
    },
    {
      "epoch": 0.49486461251167135,
      "grad_norm": 6.459392070770264,
      "learning_rate": 2.469654528478058e-05,
      "loss": 1.0809,
      "num_input_tokens_seen": 68144,
      "step": 530
    },
    {
      "epoch": 0.4995331465919701,
      "grad_norm": 8.934903144836426,
      "learning_rate": 2.492997198879552e-05,
      "loss": 0.9926,
      "num_input_tokens_seen": 68752,
      "step": 535
    },
    {
      "epoch": 0.5004668534080299,
      "eval_loss": 0.8520299792289734,
      "eval_runtime": 3.8914,
      "eval_samples_per_second": 61.161,
      "eval_steps_per_second": 30.58,
      "num_input_tokens_seen": 68880,
      "step": 536
    },
    {
      "epoch": 0.5042016806722689,
      "grad_norm": 11.780537605285645,
      "learning_rate": 2.516339869281046e-05,
      "loss": 0.8823,
      "num_input_tokens_seen": 69424,
      "step": 540
    },
    {
      "epoch": 0.5088702147525677,
      "grad_norm": 8.576756477355957,
      "learning_rate": 2.5396825396825397e-05,
      "loss": 0.6315,
      "num_input_tokens_seen": 70128,
      "step": 545
    },
    {
      "epoch": 0.5135387488328664,
      "grad_norm": 6.6480584144592285,
      "learning_rate": 2.5630252100840336e-05,
      "loss": 0.7986,
      "num_input_tokens_seen": 70784,
      "step": 550
    },
    {
      "epoch": 0.5182072829131653,
      "grad_norm": 16.78489112854004,
      "learning_rate": 2.5863678804855278e-05,
      "loss": 0.679,
      "num_input_tokens_seen": 71456,
      "step": 555
    },
    {
      "epoch": 0.5228758169934641,
      "grad_norm": 14.316130638122559,
      "learning_rate": 2.6097105508870217e-05,
      "loss": 0.7054,
      "num_input_tokens_seen": 72096,
      "step": 560
    },
    {
      "epoch": 0.5275443510737629,
      "grad_norm": 13.169844627380371,
      "learning_rate": 2.6330532212885155e-05,
      "loss": 1.5116,
      "num_input_tokens_seen": 72784,
      "step": 565
    },
    {
      "epoch": 0.5322128851540616,
      "grad_norm": 16.326101303100586,
      "learning_rate": 2.6563958916900094e-05,
      "loss": 0.9846,
      "num_input_tokens_seen": 73440,
      "step": 570
    },
    {
      "epoch": 0.5368814192343604,
      "grad_norm": 13.680533409118652,
      "learning_rate": 2.6797385620915033e-05,
      "loss": 0.6246,
      "num_input_tokens_seen": 74096,
      "step": 575
    },
    {
      "epoch": 0.5415499533146592,
      "grad_norm": 19.503694534301758,
      "learning_rate": 2.7030812324929972e-05,
      "loss": 1.4243,
      "num_input_tokens_seen": 74800,
      "step": 580
    },
    {
      "epoch": 0.5462184873949579,
      "grad_norm": 7.272068977355957,
      "learning_rate": 2.726423902894491e-05,
      "loss": 0.5295,
      "num_input_tokens_seen": 75344,
      "step": 585
    },
    {
      "epoch": 0.5508870214752568,
      "grad_norm": 9.080270767211914,
      "learning_rate": 2.7497665732959853e-05,
      "loss": 0.7331,
      "num_input_tokens_seen": 75920,
      "step": 590
    },
    {
      "epoch": 0.5555555555555556,
      "grad_norm": 12.239612579345703,
      "learning_rate": 2.773109243697479e-05,
      "loss": 1.0243,
      "num_input_tokens_seen": 76480,
      "step": 595
    },
    {
      "epoch": 0.5602240896358543,
      "grad_norm": 5.2204790115356445,
      "learning_rate": 2.796451914098973e-05,
      "loss": 0.7178,
      "num_input_tokens_seen": 77248,
      "step": 600
    },
    {
      "epoch": 0.5648926237161531,
      "grad_norm": 23.551700592041016,
      "learning_rate": 2.819794584500467e-05,
      "loss": 0.7556,
      "num_input_tokens_seen": 77920,
      "step": 605
    },
    {
      "epoch": 0.5695611577964519,
      "grad_norm": 32.25157928466797,
      "learning_rate": 2.8431372549019608e-05,
      "loss": 0.8648,
      "num_input_tokens_seen": 78560,
      "step": 610
    },
    {
      "epoch": 0.5742296918767507,
      "grad_norm": 29.887779235839844,
      "learning_rate": 2.8664799253034546e-05,
      "loss": 0.8888,
      "num_input_tokens_seen": 79184,
      "step": 615
    },
    {
      "epoch": 0.5788982259570495,
      "grad_norm": 2.3339264392852783,
      "learning_rate": 2.8898225957049485e-05,
      "loss": 0.5831,
      "num_input_tokens_seen": 79776,
      "step": 620
    },
    {
      "epoch": 0.5835667600373483,
      "grad_norm": 7.388897895812988,
      "learning_rate": 2.913165266106443e-05,
      "loss": 0.8904,
      "num_input_tokens_seen": 80368,
      "step": 625
    },
    {
      "epoch": 0.5882352941176471,
      "grad_norm": 11.283287048339844,
      "learning_rate": 2.9365079365079366e-05,
      "loss": 0.6893,
      "num_input_tokens_seen": 81040,
      "step": 630
    },
    {
      "epoch": 0.5929038281979458,
      "grad_norm": 80.2661361694336,
      "learning_rate": 2.9598506069094305e-05,
      "loss": 0.7345,
      "num_input_tokens_seen": 81648,
      "step": 635
    },
    {
      "epoch": 0.5975723622782446,
      "grad_norm": 7.673534870147705,
      "learning_rate": 2.9831932773109244e-05,
      "loss": 0.8234,
      "num_input_tokens_seen": 82208,
      "step": 640
    },
    {
      "epoch": 0.6022408963585434,
      "grad_norm": 8.291051864624023,
      "learning_rate": 3.0065359477124182e-05,
      "loss": 0.9744,
      "num_input_tokens_seen": 82784,
      "step": 645
    },
    {
      "epoch": 0.6069094304388422,
      "grad_norm": 6.479700088500977,
      "learning_rate": 3.029878618113912e-05,
      "loss": 0.6968,
      "num_input_tokens_seen": 83424,
      "step": 650
    },
    {
      "epoch": 0.611577964519141,
      "grad_norm": 13.389955520629883,
      "learning_rate": 3.053221288515406e-05,
      "loss": 0.3775,
      "num_input_tokens_seen": 84112,
      "step": 655
    },
    {
      "epoch": 0.6162464985994398,
      "grad_norm": 9.457218170166016,
      "learning_rate": 3.0765639589169e-05,
      "loss": 0.919,
      "num_input_tokens_seen": 84720,
      "step": 660
    },
    {
      "epoch": 0.6209150326797386,
      "grad_norm": 8.425762176513672,
      "learning_rate": 3.099906629318394e-05,
      "loss": 1.012,
      "num_input_tokens_seen": 85264,
      "step": 665
    },
    {
      "epoch": 0.6255835667600373,
      "grad_norm": 8.807003021240234,
      "learning_rate": 3.123249299719888e-05,
      "loss": 0.6742,
      "num_input_tokens_seen": 85840,
      "step": 670
    },
    {
      "epoch": 0.6302521008403361,
      "grad_norm": 5.953861713409424,
      "learning_rate": 3.146591970121382e-05,
      "loss": 0.6991,
      "num_input_tokens_seen": 86464,
      "step": 675
    },
    {
      "epoch": 0.6349206349206349,
      "grad_norm": 6.615007400512695,
      "learning_rate": 3.169934640522876e-05,
      "loss": 0.5356,
      "num_input_tokens_seen": 87104,
      "step": 680
    },
    {
      "epoch": 0.6395891690009337,
      "grad_norm": 9.70020580291748,
      "learning_rate": 3.1932773109243696e-05,
      "loss": 0.6897,
      "num_input_tokens_seen": 87792,
      "step": 685
    },
    {
      "epoch": 0.6442577030812325,
      "grad_norm": 13.47252082824707,
      "learning_rate": 3.216619981325864e-05,
      "loss": 1.0909,
      "num_input_tokens_seen": 88480,
      "step": 690
    },
    {
      "epoch": 0.6489262371615313,
      "grad_norm": 11.896513938903809,
      "learning_rate": 3.239962651727358e-05,
      "loss": 1.3468,
      "num_input_tokens_seen": 89024,
      "step": 695
    },
    {
      "epoch": 0.6535947712418301,
      "grad_norm": 11.38027572631836,
      "learning_rate": 3.263305322128852e-05,
      "loss": 0.7407,
      "num_input_tokens_seen": 89632,
      "step": 700
    },
    {
      "epoch": 0.6582633053221288,
      "grad_norm": 8.730727195739746,
      "learning_rate": 3.286647992530346e-05,
      "loss": 0.866,
      "num_input_tokens_seen": 90256,
      "step": 705
    },
    {
      "epoch": 0.6629318394024276,
      "grad_norm": 13.638816833496094,
      "learning_rate": 3.3099906629318396e-05,
      "loss": 0.9243,
      "num_input_tokens_seen": 90864,
      "step": 710
    },
    {
      "epoch": 0.6676003734827264,
      "grad_norm": 4.575711250305176,
      "learning_rate": 3.3333333333333335e-05,
      "loss": 0.8537,
      "num_input_tokens_seen": 91584,
      "step": 715
    },
    {
      "epoch": 0.6722689075630253,
      "grad_norm": 9.954397201538086,
      "learning_rate": 3.3566760037348274e-05,
      "loss": 0.5893,
      "num_input_tokens_seen": 92192,
      "step": 720
    },
    {
      "epoch": 0.676937441643324,
      "grad_norm": 29.623050689697266,
      "learning_rate": 3.380018674136321e-05,
      "loss": 0.7864,
      "num_input_tokens_seen": 92832,
      "step": 725
    },
    {
      "epoch": 0.6816059757236228,
      "grad_norm": 4.245872974395752,
      "learning_rate": 3.403361344537815e-05,
      "loss": 0.7591,
      "num_input_tokens_seen": 93632,
      "step": 730
    },
    {
      "epoch": 0.6862745098039216,
      "grad_norm": 18.07567024230957,
      "learning_rate": 3.426704014939309e-05,
      "loss": 0.8031,
      "num_input_tokens_seen": 94256,
      "step": 735
    },
    {
      "epoch": 0.6909430438842203,
      "grad_norm": 12.143633842468262,
      "learning_rate": 3.450046685340803e-05,
      "loss": 1.1466,
      "num_input_tokens_seen": 94832,
      "step": 740
    },
    {
      "epoch": 0.6956115779645191,
      "grad_norm": 32.405723571777344,
      "learning_rate": 3.473389355742297e-05,
      "loss": 0.9448,
      "num_input_tokens_seen": 95440,
      "step": 745
    },
    {
      "epoch": 0.7002801120448179,
      "grad_norm": 11.133152961730957,
      "learning_rate": 3.4967320261437906e-05,
      "loss": 0.6633,
      "num_input_tokens_seen": 96208,
      "step": 750
    },
    {
      "epoch": 0.7049486461251168,
      "grad_norm": 5.159026145935059,
      "learning_rate": 3.520074696545285e-05,
      "loss": 0.747,
      "num_input_tokens_seen": 96992,
      "step": 755
    },
    {
      "epoch": 0.7096171802054155,
      "grad_norm": 9.558073997497559,
      "learning_rate": 3.543417366946779e-05,
      "loss": 0.8285,
      "num_input_tokens_seen": 97600,
      "step": 760
    },
    {
      "epoch": 0.7142857142857143,
      "grad_norm": 7.429594993591309,
      "learning_rate": 3.566760037348273e-05,
      "loss": 0.6289,
      "num_input_tokens_seen": 98384,
      "step": 765
    },
    {
      "epoch": 0.7189542483660131,
      "grad_norm": 9.359416961669922,
      "learning_rate": 3.590102707749767e-05,
      "loss": 1.4207,
      "num_input_tokens_seen": 99008,
      "step": 770
    },
    {
      "epoch": 0.7236227824463118,
      "grad_norm": 57.07271957397461,
      "learning_rate": 3.613445378151261e-05,
      "loss": 1.0578,
      "num_input_tokens_seen": 99648,
      "step": 775
    },
    {
      "epoch": 0.7282913165266106,
      "grad_norm": 13.815699577331543,
      "learning_rate": 3.6367880485527545e-05,
      "loss": 0.9202,
      "num_input_tokens_seen": 100368,
      "step": 780
    },
    {
      "epoch": 0.7329598506069094,
      "grad_norm": 7.66619873046875,
      "learning_rate": 3.6601307189542484e-05,
      "loss": 0.9465,
      "num_input_tokens_seen": 101056,
      "step": 785
    },
    {
      "epoch": 0.7376283846872083,
      "grad_norm": 5.203883171081543,
      "learning_rate": 3.683473389355743e-05,
      "loss": 0.5311,
      "num_input_tokens_seen": 101744,
      "step": 790
    },
    {
      "epoch": 0.742296918767507,
      "grad_norm": 19.66509437561035,
      "learning_rate": 3.706816059757237e-05,
      "loss": 1.1779,
      "num_input_tokens_seen": 102400,
      "step": 795
    },
    {
      "epoch": 0.7469654528478058,
      "grad_norm": 10.233268737792969,
      "learning_rate": 3.730158730158731e-05,
      "loss": 0.7842,
      "num_input_tokens_seen": 103120,
      "step": 800
    },
    {
      "epoch": 0.7516339869281046,
      "grad_norm": 4.705903053283691,
      "learning_rate": 3.753501400560224e-05,
      "loss": 0.6265,
      "num_input_tokens_seen": 103776,
      "step": 805
    },
    {
      "epoch": 0.7563025210084033,
      "grad_norm": 6.988365173339844,
      "learning_rate": 3.776844070961718e-05,
      "loss": 0.7407,
      "num_input_tokens_seen": 104432,
      "step": 810
    },
    {
      "epoch": 0.7609710550887021,
      "grad_norm": 9.381958961486816,
      "learning_rate": 3.800186741363212e-05,
      "loss": 0.5331,
      "num_input_tokens_seen": 105136,
      "step": 815
    },
    {
      "epoch": 0.765639589169001,
      "grad_norm": 6.593667984008789,
      "learning_rate": 3.8235294117647055e-05,
      "loss": 0.6444,
      "num_input_tokens_seen": 105760,
      "step": 820
    },
    {
      "epoch": 0.7703081232492998,
      "grad_norm": 6.7614569664001465,
      "learning_rate": 3.8468720821662e-05,
      "loss": 0.9035,
      "num_input_tokens_seen": 106416,
      "step": 825
    },
    {
      "epoch": 0.7749766573295985,
      "grad_norm": 29.56807518005371,
      "learning_rate": 3.870214752567694e-05,
      "loss": 0.6458,
      "num_input_tokens_seen": 107264,
      "step": 830
    },
    {
      "epoch": 0.7796451914098973,
      "grad_norm": 2.848146915435791,
      "learning_rate": 3.893557422969188e-05,
      "loss": 0.5388,
      "num_input_tokens_seen": 107952,
      "step": 835
    },
    {
      "epoch": 0.7843137254901961,
      "grad_norm": 4.524416446685791,
      "learning_rate": 3.916900093370682e-05,
      "loss": 0.843,
      "num_input_tokens_seen": 108560,
      "step": 840
    },
    {
      "epoch": 0.7889822595704948,
      "grad_norm": 13.986185073852539,
      "learning_rate": 3.9402427637721756e-05,
      "loss": 0.8335,
      "num_input_tokens_seen": 109216,
      "step": 845
    },
    {
      "epoch": 0.7936507936507936,
      "grad_norm": 7.887524604797363,
      "learning_rate": 3.9635854341736695e-05,
      "loss": 1.1149,
      "num_input_tokens_seen": 109824,
      "step": 850
    },
    {
      "epoch": 0.7983193277310925,
      "grad_norm": 10.166050910949707,
      "learning_rate": 3.986928104575164e-05,
      "loss": 0.9014,
      "num_input_tokens_seen": 110496,
      "step": 855
    },
    {
      "epoch": 0.8029878618113913,
      "grad_norm": 3.036750555038452,
      "learning_rate": 4.010270774976658e-05,
      "loss": 0.4639,
      "num_input_tokens_seen": 111248,
      "step": 860
    },
    {
      "epoch": 0.80765639589169,
      "grad_norm": 8.926546096801758,
      "learning_rate": 4.033613445378152e-05,
      "loss": 0.8944,
      "num_input_tokens_seen": 111888,
      "step": 865
    },
    {
      "epoch": 0.8123249299719888,
      "grad_norm": 14.85969066619873,
      "learning_rate": 4.0569561157796457e-05,
      "loss": 0.6904,
      "num_input_tokens_seen": 112496,
      "step": 870
    },
    {
      "epoch": 0.8169934640522876,
      "grad_norm": 8.692270278930664,
      "learning_rate": 4.0802987861811395e-05,
      "loss": 1.0703,
      "num_input_tokens_seen": 113152,
      "step": 875
    },
    {
      "epoch": 0.8216619981325863,
      "grad_norm": 2.891441822052002,
      "learning_rate": 4.1036414565826334e-05,
      "loss": 0.6687,
      "num_input_tokens_seen": 113856,
      "step": 880
    },
    {
      "epoch": 0.8263305322128851,
      "grad_norm": 9.000094413757324,
      "learning_rate": 4.126984126984127e-05,
      "loss": 0.7608,
      "num_input_tokens_seen": 114512,
      "step": 885
    },
    {
      "epoch": 0.830999066293184,
      "grad_norm": 4.151757717132568,
      "learning_rate": 4.150326797385621e-05,
      "loss": 0.6469,
      "num_input_tokens_seen": 115168,
      "step": 890
    },
    {
      "epoch": 0.8356676003734828,
      "grad_norm": 16.637250900268555,
      "learning_rate": 4.173669467787115e-05,
      "loss": 0.5817,
      "num_input_tokens_seen": 115728,
      "step": 895
    },
    {
      "epoch": 0.8403361344537815,
      "grad_norm": 17.092973709106445,
      "learning_rate": 4.197012138188609e-05,
      "loss": 0.9298,
      "num_input_tokens_seen": 116352,
      "step": 900
    },
    {
      "epoch": 0.8450046685340803,
      "grad_norm": 3.194045305252075,
      "learning_rate": 4.220354808590103e-05,
      "loss": 0.8504,
      "num_input_tokens_seen": 117008,
      "step": 905
    },
    {
      "epoch": 0.8496732026143791,
      "grad_norm": 14.898492813110352,
      "learning_rate": 4.2436974789915967e-05,
      "loss": 0.5924,
      "num_input_tokens_seen": 117776,
      "step": 910
    },
    {
      "epoch": 0.8543417366946778,
      "grad_norm": 12.749673843383789,
      "learning_rate": 4.2670401493930905e-05,
      "loss": 0.7349,
      "num_input_tokens_seen": 118368,
      "step": 915
    },
    {
      "epoch": 0.8590102707749766,
      "grad_norm": 7.691690921783447,
      "learning_rate": 4.2903828197945844e-05,
      "loss": 0.7437,
      "num_input_tokens_seen": 119024,
      "step": 920
    },
    {
      "epoch": 0.8636788048552755,
      "grad_norm": 7.8026442527771,
      "learning_rate": 4.313725490196079e-05,
      "loss": 0.5653,
      "num_input_tokens_seen": 119632,
      "step": 925
    },
    {
      "epoch": 0.8683473389355743,
      "grad_norm": 2.568347692489624,
      "learning_rate": 4.337068160597573e-05,
      "loss": 0.9791,
      "num_input_tokens_seen": 120240,
      "step": 930
    },
    {
      "epoch": 0.873015873015873,
      "grad_norm": 5.230515480041504,
      "learning_rate": 4.360410830999067e-05,
      "loss": 0.976,
      "num_input_tokens_seen": 120832,
      "step": 935
    },
    {
      "epoch": 0.8776844070961718,
      "grad_norm": 14.007842063903809,
      "learning_rate": 4.3837535014005606e-05,
      "loss": 0.9204,
      "num_input_tokens_seen": 121488,
      "step": 940
    },
    {
      "epoch": 0.8823529411764706,
      "grad_norm": 7.546886920928955,
      "learning_rate": 4.4070961718020545e-05,
      "loss": 0.6922,
      "num_input_tokens_seen": 122112,
      "step": 945
    },
    {
      "epoch": 0.8870214752567693,
      "grad_norm": 6.620900630950928,
      "learning_rate": 4.430438842203548e-05,
      "loss": 0.6913,
      "num_input_tokens_seen": 122736,
      "step": 950
    },
    {
      "epoch": 0.8916900093370682,
      "grad_norm": 1.7955033779144287,
      "learning_rate": 4.453781512605042e-05,
      "loss": 1.0533,
      "num_input_tokens_seen": 123328,
      "step": 955
    },
    {
      "epoch": 0.896358543417367,
      "grad_norm": 16.979063034057617,
      "learning_rate": 4.477124183006536e-05,
      "loss": 0.7929,
      "num_input_tokens_seen": 124016,
      "step": 960
    },
    {
      "epoch": 0.9010270774976658,
      "grad_norm": 4.907767295837402,
      "learning_rate": 4.50046685340803e-05,
      "loss": 0.7584,
      "num_input_tokens_seen": 124672,
      "step": 965
    },
    {
      "epoch": 0.9056956115779645,
      "grad_norm": 10.652020454406738,
      "learning_rate": 4.523809523809524e-05,
      "loss": 0.9878,
      "num_input_tokens_seen": 125376,
      "step": 970
    },
    {
      "epoch": 0.9103641456582633,
      "grad_norm": 6.290508270263672,
      "learning_rate": 4.547152194211018e-05,
      "loss": 0.7448,
      "num_input_tokens_seen": 126032,
      "step": 975
    },
    {
      "epoch": 0.9150326797385621,
      "grad_norm": 9.92817497253418,
      "learning_rate": 4.5704948646125116e-05,
      "loss": 0.8184,
      "num_input_tokens_seen": 126640,
      "step": 980
    },
    {
      "epoch": 0.9197012138188608,
      "grad_norm": 3.918785333633423,
      "learning_rate": 4.5938375350140055e-05,
      "loss": 0.4832,
      "num_input_tokens_seen": 127360,
      "step": 985
    },
    {
      "epoch": 0.9243697478991597,
      "grad_norm": 7.656343936920166,
      "learning_rate": 4.6171802054155e-05,
      "loss": 0.7947,
      "num_input_tokens_seen": 128000,
      "step": 990
    },
    {
      "epoch": 0.9290382819794585,
      "grad_norm": 9.467264175415039,
      "learning_rate": 4.640522875816994e-05,
      "loss": 0.9679,
      "num_input_tokens_seen": 128672,
      "step": 995
    },
    {
      "epoch": 0.9337068160597572,
      "grad_norm": 8.756389617919922,
      "learning_rate": 4.663865546218488e-05,
      "loss": 0.5004,
      "num_input_tokens_seen": 129328,
      "step": 1000
    },
    {
      "epoch": 0.938375350140056,
      "grad_norm": 5.837770938873291,
      "learning_rate": 4.6872082166199816e-05,
      "loss": 0.6132,
      "num_input_tokens_seen": 129984,
      "step": 1005
    },
    {
      "epoch": 0.9430438842203548,
      "grad_norm": 4.566342830657959,
      "learning_rate": 4.7105508870214755e-05,
      "loss": 0.6422,
      "num_input_tokens_seen": 130720,
      "step": 1010
    },
    {
      "epoch": 0.9477124183006536,
      "grad_norm": 8.053881645202637,
      "learning_rate": 4.7338935574229694e-05,
      "loss": 0.419,
      "num_input_tokens_seen": 131392,
      "step": 1015
    },
    {
      "epoch": 0.9523809523809523,
      "grad_norm": 12.437357902526855,
      "learning_rate": 4.757236227824463e-05,
      "loss": 0.9272,
      "num_input_tokens_seen": 132064,
      "step": 1020
    },
    {
      "epoch": 0.9570494864612512,
      "grad_norm": 5.731832504272461,
      "learning_rate": 4.780578898225958e-05,
      "loss": 0.4189,
      "num_input_tokens_seen": 132704,
      "step": 1025
    },
    {
      "epoch": 0.96171802054155,
      "grad_norm": 5.499335289001465,
      "learning_rate": 4.803921568627452e-05,
      "loss": 0.5612,
      "num_input_tokens_seen": 133344,
      "step": 1030
    },
    {
      "epoch": 0.9663865546218487,
      "grad_norm": 7.233240604400635,
      "learning_rate": 4.827264239028945e-05,
      "loss": 0.4785,
      "num_input_tokens_seen": 133952,
      "step": 1035
    },
    {
      "epoch": 0.9710550887021475,
      "grad_norm": 38.50244903564453,
      "learning_rate": 4.850606909430439e-05,
      "loss": 0.5721,
      "num_input_tokens_seen": 134512,
      "step": 1040
    },
    {
      "epoch": 0.9757236227824463,
      "grad_norm": 8.803295135498047,
      "learning_rate": 4.8739495798319326e-05,
      "loss": 0.5325,
      "num_input_tokens_seen": 135136,
      "step": 1045
    },
    {
      "epoch": 0.9803921568627451,
      "grad_norm": 6.8154826164245605,
      "learning_rate": 4.8972922502334265e-05,
      "loss": 0.7822,
      "num_input_tokens_seen": 135728,
      "step": 1050
    },
    {
      "epoch": 0.9850606909430439,
      "grad_norm": 10.711194038391113,
      "learning_rate": 4.9206349206349204e-05,
      "loss": 1.1347,
      "num_input_tokens_seen": 136336,
      "step": 1055
    },
    {
      "epoch": 0.9897292250233427,
      "grad_norm": 4.25268030166626,
      "learning_rate": 4.943977591036415e-05,
      "loss": 0.8395,
      "num_input_tokens_seen": 136976,
      "step": 1060
    },
    {
      "epoch": 0.9943977591036415,
      "grad_norm": 14.6632661819458,
      "learning_rate": 4.967320261437909e-05,
      "loss": 0.9144,
      "num_input_tokens_seen": 137584,
      "step": 1065
    },
    {
      "epoch": 0.9990662931839402,
      "grad_norm": 21.839466094970703,
      "learning_rate": 4.990662931839403e-05,
      "loss": 1.0361,
      "num_input_tokens_seen": 138160,
      "step": 1070
    },
    {
      "epoch": 1.0009337068160598,
      "eval_loss": 0.7260501980781555,
      "eval_runtime": 3.8833,
      "eval_samples_per_second": 61.288,
      "eval_steps_per_second": 30.644,
      "num_input_tokens_seen": 138320,
      "step": 1072
    },
    {
      "epoch": 1.003734827264239,
      "grad_norm": 4.944736480712891,
      "learning_rate": 4.999998804943956e-05,
      "loss": 0.7482,
      "num_input_tokens_seen": 138640,
      "step": 1075
    },
    {
      "epoch": 1.0084033613445378,
      "grad_norm": 14.909157752990723,
      "learning_rate": 4.999991501827824e-05,
      "loss": 0.6911,
      "num_input_tokens_seen": 139360,
      "step": 1080
    },
    {
      "epoch": 1.0130718954248366,
      "grad_norm": 3.459869146347046,
      "learning_rate": 4.999977559534957e-05,
      "loss": 0.5468,
      "num_input_tokens_seen": 139984,
      "step": 1085
    },
    {
      "epoch": 1.0177404295051353,
      "grad_norm": 4.126331329345703,
      "learning_rate": 4.9999569781023795e-05,
      "loss": 0.9195,
      "num_input_tokens_seen": 140592,
      "step": 1090
    },
    {
      "epoch": 1.022408963585434,
      "grad_norm": 13.112894058227539,
      "learning_rate": 4.99992975758475e-05,
      "loss": 1.0261,
      "num_input_tokens_seen": 141248,
      "step": 1095
    },
    {
      "epoch": 1.0270774976657329,
      "grad_norm": 46.097816467285156,
      "learning_rate": 4.999895898054357e-05,
      "loss": 1.0014,
      "num_input_tokens_seen": 141760,
      "step": 1100
    },
    {
      "epoch": 1.0317460317460316,
      "grad_norm": 9.920419692993164,
      "learning_rate": 4.999855399601122e-05,
      "loss": 0.9958,
      "num_input_tokens_seen": 142416,
      "step": 1105
    },
    {
      "epoch": 1.0364145658263306,
      "grad_norm": 5.019301891326904,
      "learning_rate": 4.999808262332595e-05,
      "loss": 0.5371,
      "num_input_tokens_seen": 143136,
      "step": 1110
    },
    {
      "epoch": 1.0410830999066294,
      "grad_norm": 9.689529418945312,
      "learning_rate": 4.9997544863739565e-05,
      "loss": 0.7149,
      "num_input_tokens_seen": 143776,
      "step": 1115
    },
    {
      "epoch": 1.0457516339869282,
      "grad_norm": 13.740153312683105,
      "learning_rate": 4.999694071868019e-05,
      "loss": 0.6224,
      "num_input_tokens_seen": 144448,
      "step": 1120
    },
    {
      "epoch": 1.050420168067227,
      "grad_norm": 12.397239685058594,
      "learning_rate": 4.999627018975226e-05,
      "loss": 0.7541,
      "num_input_tokens_seen": 145056,
      "step": 1125
    },
    {
      "epoch": 1.0550887021475257,
      "grad_norm": 8.042997360229492,
      "learning_rate": 4.999553327873645e-05,
      "loss": 0.6833,
      "num_input_tokens_seen": 145728,
      "step": 1130
    },
    {
      "epoch": 1.0597572362278245,
      "grad_norm": 4.10331392288208,
      "learning_rate": 4.999472998758978e-05,
      "loss": 0.5729,
      "num_input_tokens_seen": 146400,
      "step": 1135
    },
    {
      "epoch": 1.0644257703081232,
      "grad_norm": 11.920084953308105,
      "learning_rate": 4.999386031844554e-05,
      "loss": 1.217,
      "num_input_tokens_seen": 146960,
      "step": 1140
    },
    {
      "epoch": 1.069094304388422,
      "grad_norm": 7.711158275604248,
      "learning_rate": 4.999292427361328e-05,
      "loss": 0.7875,
      "num_input_tokens_seen": 147584,
      "step": 1145
    },
    {
      "epoch": 1.0737628384687208,
      "grad_norm": 5.348123073577881,
      "learning_rate": 4.999192185557884e-05,
      "loss": 0.6283,
      "num_input_tokens_seen": 148224,
      "step": 1150
    },
    {
      "epoch": 1.0784313725490196,
      "grad_norm": 7.719697952270508,
      "learning_rate": 4.999085306700431e-05,
      "loss": 1.2682,
      "num_input_tokens_seen": 148816,
      "step": 1155
    },
    {
      "epoch": 1.0830999066293183,
      "grad_norm": 2.1265199184417725,
      "learning_rate": 4.998971791072807e-05,
      "loss": 0.7327,
      "num_input_tokens_seen": 149568,
      "step": 1160
    },
    {
      "epoch": 1.087768440709617,
      "grad_norm": 12.70578670501709,
      "learning_rate": 4.998851638976472e-05,
      "loss": 0.5073,
      "num_input_tokens_seen": 150192,
      "step": 1165
    },
    {
      "epoch": 1.092436974789916,
      "grad_norm": 6.228635787963867,
      "learning_rate": 4.9987248507305114e-05,
      "loss": 0.7351,
      "num_input_tokens_seen": 150848,
      "step": 1170
    },
    {
      "epoch": 1.0971055088702149,
      "grad_norm": 9.919082641601562,
      "learning_rate": 4.998591426671635e-05,
      "loss": 0.4393,
      "num_input_tokens_seen": 151440,
      "step": 1175
    },
    {
      "epoch": 1.1017740429505136,
      "grad_norm": 2.2125799655914307,
      "learning_rate": 4.998451367154173e-05,
      "loss": 0.7022,
      "num_input_tokens_seen": 152032,
      "step": 1180
    },
    {
      "epoch": 1.1064425770308124,
      "grad_norm": 2.3273732662200928,
      "learning_rate": 4.998304672550081e-05,
      "loss": 0.5521,
      "num_input_tokens_seen": 152688,
      "step": 1185
    },
    {
      "epoch": 1.1111111111111112,
      "grad_norm": 8.704615592956543,
      "learning_rate": 4.9981513432489295e-05,
      "loss": 1.1852,
      "num_input_tokens_seen": 153296,
      "step": 1190
    },
    {
      "epoch": 1.11577964519141,
      "grad_norm": 6.8140645027160645,
      "learning_rate": 4.9979913796579146e-05,
      "loss": 0.5501,
      "num_input_tokens_seen": 153936,
      "step": 1195
    },
    {
      "epoch": 1.1204481792717087,
      "grad_norm": 6.674036026000977,
      "learning_rate": 4.9978247822018476e-05,
      "loss": 0.6271,
      "num_input_tokens_seen": 154608,
      "step": 1200
    },
    {
      "epoch": 1.1251167133520075,
      "grad_norm": 13.860675811767578,
      "learning_rate": 4.997651551323158e-05,
      "loss": 0.5194,
      "num_input_tokens_seen": 155472,
      "step": 1205
    },
    {
      "epoch": 1.1297852474323062,
      "grad_norm": 6.035935401916504,
      "learning_rate": 4.997471687481892e-05,
      "loss": 0.8798,
      "num_input_tokens_seen": 156112,
      "step": 1210
    },
    {
      "epoch": 1.134453781512605,
      "grad_norm": 5.109834671020508,
      "learning_rate": 4.9972851911557095e-05,
      "loss": 0.4041,
      "num_input_tokens_seen": 156784,
      "step": 1215
    },
    {
      "epoch": 1.1391223155929038,
      "grad_norm": 9.794246673583984,
      "learning_rate": 4.997092062839885e-05,
      "loss": 0.7887,
      "num_input_tokens_seen": 157440,
      "step": 1220
    },
    {
      "epoch": 1.1437908496732025,
      "grad_norm": 6.726025581359863,
      "learning_rate": 4.996892303047306e-05,
      "loss": 1.0743,
      "num_input_tokens_seen": 157952,
      "step": 1225
    },
    {
      "epoch": 1.1484593837535013,
      "grad_norm": 8.099092483520508,
      "learning_rate": 4.996685912308471e-05,
      "loss": 0.4954,
      "num_input_tokens_seen": 158544,
      "step": 1230
    },
    {
      "epoch": 1.1531279178338,
      "grad_norm": 5.0507893562316895,
      "learning_rate": 4.9964728911714866e-05,
      "loss": 0.6247,
      "num_input_tokens_seen": 159232,
      "step": 1235
    },
    {
      "epoch": 1.1577964519140989,
      "grad_norm": 3.901911497116089,
      "learning_rate": 4.996253240202069e-05,
      "loss": 0.4887,
      "num_input_tokens_seen": 159824,
      "step": 1240
    },
    {
      "epoch": 1.1624649859943978,
      "grad_norm": 9.426101684570312,
      "learning_rate": 4.996026959983541e-05,
      "loss": 0.6639,
      "num_input_tokens_seen": 160544,
      "step": 1245
    },
    {
      "epoch": 1.1671335200746966,
      "grad_norm": 3.252911329269409,
      "learning_rate": 4.995794051116831e-05,
      "loss": 0.551,
      "num_input_tokens_seen": 161216,
      "step": 1250
    },
    {
      "epoch": 1.1718020541549954,
      "grad_norm": 9.056844711303711,
      "learning_rate": 4.99555451422047e-05,
      "loss": 1.3325,
      "num_input_tokens_seen": 161920,
      "step": 1255
    },
    {
      "epoch": 1.1764705882352942,
      "grad_norm": 9.504724502563477,
      "learning_rate": 4.99530834993059e-05,
      "loss": 0.9963,
      "num_input_tokens_seen": 162560,
      "step": 1260
    },
    {
      "epoch": 1.181139122315593,
      "grad_norm": 4.32997989654541,
      "learning_rate": 4.9950555589009255e-05,
      "loss": 0.5409,
      "num_input_tokens_seen": 163296,
      "step": 1265
    },
    {
      "epoch": 1.1858076563958917,
      "grad_norm": 5.476011753082275,
      "learning_rate": 4.994796141802809e-05,
      "loss": 0.7473,
      "num_input_tokens_seen": 163952,
      "step": 1270
    },
    {
      "epoch": 1.1904761904761905,
      "grad_norm": 4.545778751373291,
      "learning_rate": 4.994530099325169e-05,
      "loss": 0.6341,
      "num_input_tokens_seen": 164544,
      "step": 1275
    },
    {
      "epoch": 1.1951447245564892,
      "grad_norm": 3.5914573669433594,
      "learning_rate": 4.994257432174529e-05,
      "loss": 0.4417,
      "num_input_tokens_seen": 165216,
      "step": 1280
    },
    {
      "epoch": 1.199813258636788,
      "grad_norm": 6.310541152954102,
      "learning_rate": 4.9939781410750055e-05,
      "loss": 0.699,
      "num_input_tokens_seen": 165936,
      "step": 1285
    },
    {
      "epoch": 1.2044817927170868,
      "grad_norm": 40.967552185058594,
      "learning_rate": 4.993692226768306e-05,
      "loss": 0.6085,
      "num_input_tokens_seen": 166672,
      "step": 1290
    },
    {
      "epoch": 1.2091503267973855,
      "grad_norm": 12.131630897521973,
      "learning_rate": 4.993399690013727e-05,
      "loss": 0.8548,
      "num_input_tokens_seen": 167328,
      "step": 1295
    },
    {
      "epoch": 1.2138188608776843,
      "grad_norm": 5.9977803230285645,
      "learning_rate": 4.993100531588154e-05,
      "loss": 0.4468,
      "num_input_tokens_seen": 167920,
      "step": 1300
    },
    {
      "epoch": 1.2184873949579833,
      "grad_norm": 6.294752597808838,
      "learning_rate": 4.992794752286054e-05,
      "loss": 0.5278,
      "num_input_tokens_seen": 168496,
      "step": 1305
    },
    {
      "epoch": 1.223155929038282,
      "grad_norm": 5.4937424659729,
      "learning_rate": 4.99248235291948e-05,
      "loss": 0.6388,
      "num_input_tokens_seen": 169120,
      "step": 1310
    },
    {
      "epoch": 1.2278244631185808,
      "grad_norm": 12.049308776855469,
      "learning_rate": 4.9921633343180654e-05,
      "loss": 0.4561,
      "num_input_tokens_seen": 169808,
      "step": 1315
    },
    {
      "epoch": 1.2324929971988796,
      "grad_norm": 2.764819860458374,
      "learning_rate": 4.99183769732902e-05,
      "loss": 0.6871,
      "num_input_tokens_seen": 170432,
      "step": 1320
    },
    {
      "epoch": 1.2371615312791784,
      "grad_norm": 4.406725883483887,
      "learning_rate": 4.991505442817131e-05,
      "loss": 0.5468,
      "num_input_tokens_seen": 171152,
      "step": 1325
    },
    {
      "epoch": 1.2418300653594772,
      "grad_norm": 3.939741611480713,
      "learning_rate": 4.9911665716647624e-05,
      "loss": 0.9366,
      "num_input_tokens_seen": 171776,
      "step": 1330
    },
    {
      "epoch": 1.246498599439776,
      "grad_norm": 3.4890053272247314,
      "learning_rate": 4.990821084771845e-05,
      "loss": 0.5881,
      "num_input_tokens_seen": 172528,
      "step": 1335
    },
    {
      "epoch": 1.2511671335200747,
      "grad_norm": 11.309175491333008,
      "learning_rate": 4.990468983055883e-05,
      "loss": 0.954,
      "num_input_tokens_seen": 173200,
      "step": 1340
    },
    {
      "epoch": 1.2558356676003735,
      "grad_norm": 1.9080860614776611,
      "learning_rate": 4.990110267451944e-05,
      "loss": 0.6237,
      "num_input_tokens_seen": 173840,
      "step": 1345
    },
    {
      "epoch": 1.2605042016806722,
      "grad_norm": 8.571795463562012,
      "learning_rate": 4.989744938912663e-05,
      "loss": 0.5249,
      "num_input_tokens_seen": 174560,
      "step": 1350
    },
    {
      "epoch": 1.265172735760971,
      "grad_norm": 7.339284896850586,
      "learning_rate": 4.989372998408236e-05,
      "loss": 1.1344,
      "num_input_tokens_seen": 175248,
      "step": 1355
    },
    {
      "epoch": 1.2698412698412698,
      "grad_norm": 4.928005695343018,
      "learning_rate": 4.9889944469264166e-05,
      "loss": 0.8068,
      "num_input_tokens_seen": 175888,
      "step": 1360
    },
    {
      "epoch": 1.2745098039215685,
      "grad_norm": 6.07223653793335,
      "learning_rate": 4.988609285472517e-05,
      "loss": 0.7017,
      "num_input_tokens_seen": 176560,
      "step": 1365
    },
    {
      "epoch": 1.2791783380018673,
      "grad_norm": 6.8295698165893555,
      "learning_rate": 4.988217515069403e-05,
      "loss": 0.9394,
      "num_input_tokens_seen": 177136,
      "step": 1370
    },
    {
      "epoch": 1.283846872082166,
      "grad_norm": 11.6112642288208,
      "learning_rate": 4.98781913675749e-05,
      "loss": 0.6165,
      "num_input_tokens_seen": 177728,
      "step": 1375
    },
    {
      "epoch": 1.2885154061624648,
      "grad_norm": 4.467996597290039,
      "learning_rate": 4.9874141515947456e-05,
      "loss": 0.3883,
      "num_input_tokens_seen": 178352,
      "step": 1380
    },
    {
      "epoch": 1.2931839402427638,
      "grad_norm": 3.5836374759674072,
      "learning_rate": 4.987002560656678e-05,
      "loss": 0.6349,
      "num_input_tokens_seen": 178928,
      "step": 1385
    },
    {
      "epoch": 1.2978524743230626,
      "grad_norm": 13.943968772888184,
      "learning_rate": 4.986584365036343e-05,
      "loss": 0.5884,
      "num_input_tokens_seen": 179568,
      "step": 1390
    },
    {
      "epoch": 1.3025210084033614,
      "grad_norm": 10.653841972351074,
      "learning_rate": 4.986159565844333e-05,
      "loss": 0.8191,
      "num_input_tokens_seen": 180272,
      "step": 1395
    },
    {
      "epoch": 1.3071895424836601,
      "grad_norm": 6.080850124359131,
      "learning_rate": 4.9857281642087785e-05,
      "loss": 0.7922,
      "num_input_tokens_seen": 180912,
      "step": 1400
    },
    {
      "epoch": 1.311858076563959,
      "grad_norm": 2.8319334983825684,
      "learning_rate": 4.985290161275345e-05,
      "loss": 0.769,
      "num_input_tokens_seen": 181536,
      "step": 1405
    },
    {
      "epoch": 1.3165266106442577,
      "grad_norm": 3.922654867172241,
      "learning_rate": 4.9848455582072265e-05,
      "loss": 0.4264,
      "num_input_tokens_seen": 182128,
      "step": 1410
    },
    {
      "epoch": 1.3211951447245565,
      "grad_norm": 2.9652132987976074,
      "learning_rate": 4.984394356185148e-05,
      "loss": 0.5366,
      "num_input_tokens_seen": 182816,
      "step": 1415
    },
    {
      "epoch": 1.3258636788048552,
      "grad_norm": 7.564930438995361,
      "learning_rate": 4.983936556407357e-05,
      "loss": 0.6557,
      "num_input_tokens_seen": 183408,
      "step": 1420
    },
    {
      "epoch": 1.330532212885154,
      "grad_norm": 3.050708770751953,
      "learning_rate": 4.983472160089623e-05,
      "loss": 0.7718,
      "num_input_tokens_seen": 184064,
      "step": 1425
    },
    {
      "epoch": 1.3352007469654528,
      "grad_norm": 8.317185401916504,
      "learning_rate": 4.983001168465234e-05,
      "loss": 0.547,
      "num_input_tokens_seen": 184704,
      "step": 1430
    },
    {
      "epoch": 1.3398692810457518,
      "grad_norm": 3.655439853668213,
      "learning_rate": 4.982523582784992e-05,
      "loss": 0.6278,
      "num_input_tokens_seen": 185280,
      "step": 1435
    },
    {
      "epoch": 1.3445378151260505,
      "grad_norm": 8.60561752319336,
      "learning_rate": 4.9820394043172136e-05,
      "loss": 0.7132,
      "num_input_tokens_seen": 185872,
      "step": 1440
    },
    {
      "epoch": 1.3492063492063493,
      "grad_norm": 8.540854454040527,
      "learning_rate": 4.98154863434772e-05,
      "loss": 0.7167,
      "num_input_tokens_seen": 186432,
      "step": 1445
    },
    {
      "epoch": 1.353874883286648,
      "grad_norm": 11.289070129394531,
      "learning_rate": 4.98105127417984e-05,
      "loss": 0.7061,
      "num_input_tokens_seen": 187120,
      "step": 1450
    },
    {
      "epoch": 1.3585434173669468,
      "grad_norm": 8.960957527160645,
      "learning_rate": 4.980547325134401e-05,
      "loss": 1.1768,
      "num_input_tokens_seen": 187648,
      "step": 1455
    },
    {
      "epoch": 1.3632119514472456,
      "grad_norm": 9.159289360046387,
      "learning_rate": 4.980036788549733e-05,
      "loss": 0.6238,
      "num_input_tokens_seen": 188288,
      "step": 1460
    },
    {
      "epoch": 1.3678804855275444,
      "grad_norm": 9.342123985290527,
      "learning_rate": 4.9795196657816564e-05,
      "loss": 0.6563,
      "num_input_tokens_seen": 188880,
      "step": 1465
    },
    {
      "epoch": 1.3725490196078431,
      "grad_norm": 5.995706081390381,
      "learning_rate": 4.978995958203484e-05,
      "loss": 0.6132,
      "num_input_tokens_seen": 189488,
      "step": 1470
    },
    {
      "epoch": 1.377217553688142,
      "grad_norm": 14.103466987609863,
      "learning_rate": 4.978465667206015e-05,
      "loss": 0.6597,
      "num_input_tokens_seen": 190112,
      "step": 1475
    },
    {
      "epoch": 1.3818860877684407,
      "grad_norm": 4.49061918258667,
      "learning_rate": 4.977928794197532e-05,
      "loss": 0.8417,
      "num_input_tokens_seen": 190816,
      "step": 1480
    },
    {
      "epoch": 1.3865546218487395,
      "grad_norm": 5.403022766113281,
      "learning_rate": 4.977385340603798e-05,
      "loss": 0.4915,
      "num_input_tokens_seen": 191456,
      "step": 1485
    },
    {
      "epoch": 1.3912231559290382,
      "grad_norm": 7.392147541046143,
      "learning_rate": 4.976835307868053e-05,
      "loss": 1.1056,
      "num_input_tokens_seen": 192192,
      "step": 1490
    },
    {
      "epoch": 1.395891690009337,
      "grad_norm": 2.5143346786499023,
      "learning_rate": 4.976278697451006e-05,
      "loss": 0.4763,
      "num_input_tokens_seen": 192832,
      "step": 1495
    },
| { |
| "epoch": 1.4005602240896358, |
| "grad_norm": 5.508999824523926, |
| "learning_rate": 4.975715510830837e-05, |
| "loss": 0.7842, |
| "num_input_tokens_seen": 193504, |
| "step": 1500 |
| }, |
| { |
| "epoch": 1.4052287581699345, |
| "grad_norm": 12.90170955657959, |
| "learning_rate": 4.9751457495031894e-05, |
| "loss": 0.7676, |
| "num_input_tokens_seen": 194112, |
| "step": 1505 |
| }, |
| { |
| "epoch": 1.4098972922502333, |
| "grad_norm": 4.344299793243408, |
| "learning_rate": 4.974569414981166e-05, |
| "loss": 0.9106, |
| "num_input_tokens_seen": 194784, |
| "step": 1510 |
| }, |
| { |
| "epoch": 1.4145658263305323, |
| "grad_norm": 3.9784064292907715, |
| "learning_rate": 4.973986508795327e-05, |
| "loss": 0.4855, |
| "num_input_tokens_seen": 195456, |
| "step": 1515 |
| }, |
| { |
| "epoch": 1.419234360410831, |
| "grad_norm": 8.706938743591309, |
| "learning_rate": 4.9733970324936855e-05, |
| "loss": 0.6186, |
| "num_input_tokens_seen": 196064, |
| "step": 1520 |
| }, |
| { |
| "epoch": 1.4239028944911298, |
| "grad_norm": 21.58633804321289, |
| "learning_rate": 4.9728009876416995e-05, |
| "loss": 0.7339, |
| "num_input_tokens_seen": 196688, |
| "step": 1525 |
| }, |
| { |
| "epoch": 1.4285714285714286, |
| "grad_norm": 3.153777599334717, |
| "learning_rate": 4.972198375822276e-05, |
| "loss": 0.9472, |
| "num_input_tokens_seen": 197360, |
| "step": 1530 |
| }, |
| { |
| "epoch": 1.4332399626517274, |
| "grad_norm": 5.13116455078125, |
| "learning_rate": 4.9715891986357566e-05, |
| "loss": 0.8951, |
| "num_input_tokens_seen": 198032, |
| "step": 1535 |
| }, |
| { |
| "epoch": 1.4379084967320261, |
| "grad_norm": 0.6041532754898071, |
| "learning_rate": 4.9709734576999226e-05, |
| "loss": 0.4999, |
| "num_input_tokens_seen": 198624, |
| "step": 1540 |
| }, |
| { |
| "epoch": 1.442577030812325, |
| "grad_norm": 7.3027143478393555, |
| "learning_rate": 4.9703511546499836e-05, |
| "loss": 0.7394, |
| "num_input_tokens_seen": 199312, |
| "step": 1545 |
| }, |
| { |
| "epoch": 1.4472455648926237, |
| "grad_norm": 9.439616203308105, |
| "learning_rate": 4.969722291138578e-05, |
| "loss": 0.9974, |
| "num_input_tokens_seen": 199968, |
| "step": 1550 |
| }, |
| { |
| "epoch": 1.4519140989729225, |
| "grad_norm": 12.853424072265625, |
| "learning_rate": 4.969086868835765e-05, |
| "loss": 0.5288, |
| "num_input_tokens_seen": 200800, |
| "step": 1555 |
| }, |
| { |
| "epoch": 1.4565826330532212, |
| "grad_norm": 1.419742226600647, |
| "learning_rate": 4.9684448894290236e-05, |
| "loss": 0.4857, |
| "num_input_tokens_seen": 201488, |
| "step": 1560 |
| }, |
| { |
| "epoch": 1.4612511671335202, |
| "grad_norm": 3.2149834632873535, |
| "learning_rate": 4.9677963546232445e-05, |
| "loss": 0.7109, |
| "num_input_tokens_seen": 202080, |
| "step": 1565 |
| }, |
| { |
| "epoch": 1.465919701213819, |
| "grad_norm": 3.134138584136963, |
| "learning_rate": 4.9671412661407296e-05, |
| "loss": 0.4495, |
| "num_input_tokens_seen": 202656, |
| "step": 1570 |
| }, |
| { |
| "epoch": 1.4705882352941178, |
| "grad_norm": 2.583625555038452, |
| "learning_rate": 4.966479625721183e-05, |
| "loss": 0.6337, |
| "num_input_tokens_seen": 203376, |
| "step": 1575 |
| }, |
| { |
| "epoch": 1.4752567693744165, |
| "grad_norm": 5.287656307220459, |
| "learning_rate": 4.9658114351217105e-05, |
| "loss": 0.5663, |
| "num_input_tokens_seen": 204016, |
| "step": 1580 |
| }, |
| { |
| "epoch": 1.4799253034547153, |
| "grad_norm": 7.850647926330566, |
| "learning_rate": 4.965136696116812e-05, |
| "loss": 0.8005, |
| "num_input_tokens_seen": 204640, |
| "step": 1585 |
| }, |
| { |
| "epoch": 1.484593837535014, |
| "grad_norm": 2.4759323596954346, |
| "learning_rate": 4.964455410498378e-05, |
| "loss": 0.4219, |
| "num_input_tokens_seen": 205264, |
| "step": 1590 |
| }, |
| { |
| "epoch": 1.4892623716153128, |
| "grad_norm": 1.871109962463379, |
| "learning_rate": 4.963767580075685e-05, |
| "loss": 0.506, |
| "num_input_tokens_seen": 205984, |
| "step": 1595 |
| }, |
| { |
| "epoch": 1.4939309056956116, |
| "grad_norm": 2.148587703704834, |
| "learning_rate": 4.9630732066753914e-05, |
| "loss": 0.7896, |
| "num_input_tokens_seen": 206672, |
| "step": 1600 |
| }, |
| { |
| "epoch": 1.4985994397759104, |
| "grad_norm": 8.374788284301758, |
| "learning_rate": 4.962372292141529e-05, |
| "loss": 0.6059, |
| "num_input_tokens_seen": 207360, |
| "step": 1605 |
| }, |
| { |
| "epoch": 1.5014005602240896, |
| "eval_loss": 0.6730406880378723, |
| "eval_runtime": 3.8743, |
| "eval_samples_per_second": 61.431, |
| "eval_steps_per_second": 30.716, |
| "num_input_tokens_seen": 207744, |
| "step": 1608 |
| }, |
| { |
| "epoch": 1.5032679738562091, |
| "grad_norm": 4.593263149261475, |
| "learning_rate": 4.9616648383355037e-05, |
| "loss": 0.5781, |
| "num_input_tokens_seen": 207936, |
| "step": 1610 |
| }, |
| { |
| "epoch": 1.507936507936508, |
| "grad_norm": 3.9901912212371826, |
| "learning_rate": 4.960950847136085e-05, |
| "loss": 0.6448, |
| "num_input_tokens_seen": 208528, |
| "step": 1615 |
| }, |
| { |
| "epoch": 1.5126050420168067, |
| "grad_norm": 12.342235565185547, |
| "learning_rate": 4.9602303204394044e-05, |
| "loss": 0.6921, |
| "num_input_tokens_seen": 209216, |
| "step": 1620 |
| }, |
| { |
| "epoch": 1.5172735760971054, |
| "grad_norm": 19.401485443115234, |
| "learning_rate": 4.9595032601589514e-05, |
| "loss": 0.4746, |
| "num_input_tokens_seen": 209856, |
| "step": 1625 |
| }, |
| { |
| "epoch": 1.5219421101774042, |
| "grad_norm": 5.1063971519470215, |
| "learning_rate": 4.958769668225565e-05, |
| "loss": 0.5231, |
| "num_input_tokens_seen": 210480, |
| "step": 1630 |
| }, |
| { |
| "epoch": 1.526610644257703, |
| "grad_norm": 3.45485258102417, |
| "learning_rate": 4.9580295465874304e-05, |
| "loss": 0.503, |
| "num_input_tokens_seen": 211152, |
| "step": 1635 |
| }, |
| { |
| "epoch": 1.5312791783380018, |
| "grad_norm": 8.241432189941406, |
| "learning_rate": 4.9572828972100734e-05, |
| "loss": 0.5253, |
| "num_input_tokens_seen": 211760, |
| "step": 1640 |
| }, |
| { |
| "epoch": 1.5359477124183005, |
| "grad_norm": 4.817863464355469, |
| "learning_rate": 4.956529722076355e-05, |
| "loss": 0.5521, |
| "num_input_tokens_seen": 212400, |
| "step": 1645 |
| }, |
| { |
| "epoch": 1.5406162464985993, |
| "grad_norm": 3.3362035751342773, |
| "learning_rate": 4.955770023186469e-05, |
| "loss": 0.4619, |
| "num_input_tokens_seen": 213040, |
| "step": 1650 |
| }, |
| { |
| "epoch": 1.545284780578898, |
| "grad_norm": 10.945075035095215, |
| "learning_rate": 4.9550038025579306e-05, |
| "loss": 1.012, |
| "num_input_tokens_seen": 213696, |
| "step": 1655 |
| }, |
| { |
| "epoch": 1.549953314659197, |
| "grad_norm": 10.770771980285645, |
| "learning_rate": 4.954231062225576e-05, |
| "loss": 0.4708, |
| "num_input_tokens_seen": 214336, |
| "step": 1660 |
| }, |
| { |
| "epoch": 1.5546218487394958, |
| "grad_norm": 1.7344568967819214, |
| "learning_rate": 4.9534518042415575e-05, |
| "loss": 0.3593, |
| "num_input_tokens_seen": 215040, |
| "step": 1665 |
| }, |
| { |
| "epoch": 1.5592903828197946, |
| "grad_norm": 7.169672012329102, |
| "learning_rate": 4.9526660306753346e-05, |
| "loss": 0.7011, |
| "num_input_tokens_seen": 215696, |
| "step": 1670 |
| }, |
| { |
| "epoch": 1.5639589169000934, |
| "grad_norm": 5.537058353424072, |
| "learning_rate": 4.95187374361367e-05, |
| "loss": 0.4733, |
| "num_input_tokens_seen": 216352, |
| "step": 1675 |
| }, |
| { |
| "epoch": 1.5686274509803921, |
| "grad_norm": 4.959131240844727, |
| "learning_rate": 4.951074945160623e-05, |
| "loss": 0.4652, |
| "num_input_tokens_seen": 216928, |
| "step": 1680 |
| }, |
| { |
| "epoch": 1.573295985060691, |
| "grad_norm": 13.092927932739258, |
| "learning_rate": 4.950269637437548e-05, |
| "loss": 0.701, |
| "num_input_tokens_seen": 217552, |
| "step": 1685 |
| }, |
| { |
| "epoch": 1.5779645191409897, |
| "grad_norm": 18.209341049194336, |
| "learning_rate": 4.949457822583085e-05, |
| "loss": 0.5557, |
| "num_input_tokens_seen": 218208, |
| "step": 1690 |
| }, |
| { |
| "epoch": 1.5826330532212887, |
| "grad_norm": 2.769484758377075, |
| "learning_rate": 4.9486395027531526e-05, |
| "loss": 0.5706, |
| "num_input_tokens_seen": 218784, |
| "step": 1695 |
| }, |
| { |
| "epoch": 1.5873015873015874, |
| "grad_norm": 5.670962810516357, |
| "learning_rate": 4.947814680120947e-05, |
| "loss": 0.7001, |
| "num_input_tokens_seen": 219424, |
| "step": 1700 |
| }, |
| { |
| "epoch": 1.5919701213818862, |
| "grad_norm": 8.110953330993652, |
| "learning_rate": 4.946983356876932e-05, |
| "loss": 0.739, |
| "num_input_tokens_seen": 220032, |
| "step": 1705 |
| }, |
| { |
| "epoch": 1.596638655462185, |
| "grad_norm": 1.141108512878418, |
| "learning_rate": 4.946145535228837e-05, |
| "loss": 0.4477, |
| "num_input_tokens_seen": 220752, |
| "step": 1710 |
| }, |
| { |
| "epoch": 1.6013071895424837, |
| "grad_norm": 6.900874137878418, |
| "learning_rate": 4.945301217401648e-05, |
| "loss": 0.6388, |
| "num_input_tokens_seen": 221376, |
| "step": 1715 |
| }, |
| { |
| "epoch": 1.6059757236227825, |
| "grad_norm": 5.326786518096924, |
| "learning_rate": 4.944450405637602e-05, |
| "loss": 0.5577, |
| "num_input_tokens_seen": 222032, |
| "step": 1720 |
| }, |
| { |
| "epoch": 1.6106442577030813, |
| "grad_norm": 15.054530143737793, |
| "learning_rate": 4.943593102196183e-05, |
| "loss": 0.8117, |
| "num_input_tokens_seen": 222608, |
| "step": 1725 |
| }, |
| { |
| "epoch": 1.61531279178338, |
| "grad_norm": 6.387323379516602, |
| "learning_rate": 4.942729309354115e-05, |
| "loss": 0.5248, |
| "num_input_tokens_seen": 223264, |
| "step": 1730 |
| }, |
| { |
| "epoch": 1.6199813258636788, |
| "grad_norm": 9.268798828125, |
| "learning_rate": 4.941859029405353e-05, |
| "loss": 0.6716, |
| "num_input_tokens_seen": 223904, |
| "step": 1735 |
| }, |
| { |
| "epoch": 1.6246498599439776, |
| "grad_norm": 10.223531723022461, |
| "learning_rate": 4.940982264661084e-05, |
| "loss": 0.8909, |
| "num_input_tokens_seen": 224496, |
| "step": 1740 |
| }, |
| { |
| "epoch": 1.6293183940242764, |
| "grad_norm": 5.692781925201416, |
| "learning_rate": 4.940099017449714e-05, |
| "loss": 0.7171, |
| "num_input_tokens_seen": 225232, |
| "step": 1745 |
| }, |
| { |
| "epoch": 1.6339869281045751, |
| "grad_norm": 4.1930084228515625, |
| "learning_rate": 4.9392092901168635e-05, |
| "loss": 0.4088, |
| "num_input_tokens_seen": 225872, |
| "step": 1750 |
| }, |
| { |
| "epoch": 1.638655462184874, |
| "grad_norm": 5.081546783447266, |
| "learning_rate": 4.9383130850253645e-05, |
| "loss": 0.6362, |
| "num_input_tokens_seen": 226448, |
| "step": 1755 |
| }, |
| { |
| "epoch": 1.6433239962651727, |
| "grad_norm": 4.499953269958496, |
| "learning_rate": 4.937410404555251e-05, |
| "loss": 0.658, |
| "num_input_tokens_seen": 227040, |
| "step": 1760 |
| }, |
| { |
| "epoch": 1.6479925303454714, |
| "grad_norm": 2.329761028289795, |
| "learning_rate": 4.9365012511037514e-05, |
| "loss": 0.5597, |
| "num_input_tokens_seen": 227664, |
| "step": 1765 |
| }, |
| { |
| "epoch": 1.6526610644257702, |
| "grad_norm": 2.832841396331787, |
| "learning_rate": 4.9355856270852865e-05, |
| "loss": 0.6842, |
| "num_input_tokens_seen": 228416, |
| "step": 1770 |
| }, |
| { |
| "epoch": 1.657329598506069, |
| "grad_norm": 2.6941893100738525, |
| "learning_rate": 4.934663534931462e-05, |
| "loss": 0.3806, |
| "num_input_tokens_seen": 229120, |
| "step": 1775 |
| }, |
| { |
| "epoch": 1.6619981325863677, |
| "grad_norm": 5.795032024383545, |
| "learning_rate": 4.933734977091059e-05, |
| "loss": 0.8215, |
| "num_input_tokens_seen": 229792, |
| "step": 1780 |
| }, |
| { |
| "epoch": 1.6666666666666665, |
| "grad_norm": 2.9985203742980957, |
| "learning_rate": 4.9327999560300284e-05, |
| "loss": 0.6302, |
| "num_input_tokens_seen": 230432, |
| "step": 1785 |
| }, |
| { |
| "epoch": 1.6713352007469653, |
| "grad_norm": 2.222291946411133, |
| "learning_rate": 4.931858474231488e-05, |
| "loss": 0.4667, |
| "num_input_tokens_seen": 231040, |
| "step": 1790 |
| }, |
| { |
| "epoch": 1.6760037348272643, |
| "grad_norm": 7.770413398742676, |
| "learning_rate": 4.930910534195712e-05, |
| "loss": 0.5139, |
| "num_input_tokens_seen": 231904, |
| "step": 1795 |
| }, |
| { |
| "epoch": 1.680672268907563, |
| "grad_norm": 4.517005443572998, |
| "learning_rate": 4.9299561384401236e-05, |
| "loss": 1.2908, |
| "num_input_tokens_seen": 232480, |
| "step": 1800 |
| }, |
| { |
| "epoch": 1.6853408029878618, |
| "grad_norm": 7.685630798339844, |
| "learning_rate": 4.928995289499294e-05, |
| "loss": 0.611, |
| "num_input_tokens_seen": 233072, |
| "step": 1805 |
| }, |
| { |
| "epoch": 1.6900093370681606, |
| "grad_norm": 9.455100059509277, |
| "learning_rate": 4.92802798992493e-05, |
| "loss": 0.5963, |
| "num_input_tokens_seen": 233712, |
| "step": 1810 |
| }, |
| { |
| "epoch": 1.6946778711484594, |
| "grad_norm": 7.285452842712402, |
| "learning_rate": 4.92705424228587e-05, |
| "loss": 0.5151, |
| "num_input_tokens_seen": 234384, |
| "step": 1815 |
| }, |
| { |
| "epoch": 1.6993464052287581, |
| "grad_norm": 2.7807321548461914, |
| "learning_rate": 4.926074049168074e-05, |
| "loss": 0.4471, |
| "num_input_tokens_seen": 234976, |
| "step": 1820 |
| }, |
| { |
| "epoch": 1.7040149393090571, |
| "grad_norm": 4.179620265960693, |
| "learning_rate": 4.9250874131746226e-05, |
| "loss": 0.439, |
| "num_input_tokens_seen": 235600, |
| "step": 1825 |
| }, |
| { |
| "epoch": 1.708683473389356, |
| "grad_norm": 8.957188606262207, |
| "learning_rate": 4.924094336925704e-05, |
| "loss": 0.755, |
| "num_input_tokens_seen": 236256, |
| "step": 1830 |
| }, |
| { |
| "epoch": 1.7133520074696547, |
| "grad_norm": 2.2546701431274414, |
| "learning_rate": 4.923094823058612e-05, |
| "loss": 0.6398, |
| "num_input_tokens_seen": 236896, |
| "step": 1835 |
| }, |
| { |
| "epoch": 1.7180205415499534, |
| "grad_norm": 3.878314971923828, |
| "learning_rate": 4.9220888742277336e-05, |
| "loss": 0.957, |
| "num_input_tokens_seen": 237552, |
| "step": 1840 |
| }, |
| { |
| "epoch": 1.7226890756302522, |
| "grad_norm": 4.878526210784912, |
| "learning_rate": 4.921076493104549e-05, |
| "loss": 0.6211, |
| "num_input_tokens_seen": 238224, |
| "step": 1845 |
| }, |
| { |
| "epoch": 1.727357609710551, |
| "grad_norm": 10.116549491882324, |
| "learning_rate": 4.920057682377616e-05, |
| "loss": 0.5877, |
| "num_input_tokens_seen": 238912, |
| "step": 1850 |
| }, |
| { |
| "epoch": 1.7320261437908497, |
| "grad_norm": 1.715928077697754, |
| "learning_rate": 4.9190324447525705e-05, |
| "loss": 0.4448, |
| "num_input_tokens_seen": 239616, |
| "step": 1855 |
| }, |
| { |
| "epoch": 1.7366946778711485, |
| "grad_norm": 5.732523441314697, |
| "learning_rate": 4.918000782952114e-05, |
| "loss": 0.6933, |
| "num_input_tokens_seen": 240304, |
| "step": 1860 |
| }, |
| { |
| "epoch": 1.7413632119514473, |
| "grad_norm": 4.6649322509765625, |
| "learning_rate": 4.916962699716013e-05, |
| "loss": 0.9146, |
| "num_input_tokens_seen": 240992, |
| "step": 1865 |
| }, |
| { |
| "epoch": 1.746031746031746, |
| "grad_norm": 5.096378326416016, |
| "learning_rate": 4.9159181978010814e-05, |
| "loss": 0.8553, |
| "num_input_tokens_seen": 241648, |
| "step": 1870 |
| }, |
| { |
| "epoch": 1.7507002801120448, |
| "grad_norm": 3.9121956825256348, |
| "learning_rate": 4.9148672799811825e-05, |
| "loss": 0.3869, |
| "num_input_tokens_seen": 242320, |
| "step": 1875 |
| }, |
| { |
| "epoch": 1.7553688141923436, |
| "grad_norm": 3.4306793212890625, |
| "learning_rate": 4.9138099490472165e-05, |
| "loss": 1.2696, |
| "num_input_tokens_seen": 242912, |
| "step": 1880 |
| }, |
| { |
| "epoch": 1.7600373482726424, |
| "grad_norm": 1.4592175483703613, |
| "learning_rate": 4.912746207807117e-05, |
| "loss": 0.3769, |
| "num_input_tokens_seen": 243616, |
| "step": 1885 |
| }, |
| { |
| "epoch": 1.7647058823529411, |
| "grad_norm": 2.9718809127807617, |
| "learning_rate": 4.9116760590858404e-05, |
| "loss": 0.4158, |
| "num_input_tokens_seen": 244224, |
| "step": 1890 |
| }, |
| { |
| "epoch": 1.76937441643324, |
| "grad_norm": 5.453405857086182, |
| "learning_rate": 4.9105995057253586e-05, |
| "loss": 0.5839, |
| "num_input_tokens_seen": 244832, |
| "step": 1895 |
| }, |
| { |
| "epoch": 1.7740429505135387, |
| "grad_norm": 7.693014621734619, |
| "learning_rate": 4.9095165505846505e-05, |
| "loss": 0.6206, |
| "num_input_tokens_seen": 245520, |
| "step": 1900 |
| }, |
| { |
| "epoch": 1.7787114845938374, |
| "grad_norm": 15.895974159240723, |
| "learning_rate": 4.9084271965397014e-05, |
| "loss": 0.7017, |
| "num_input_tokens_seen": 246176, |
| "step": 1905 |
| }, |
| { |
| "epoch": 1.7833800186741362, |
| "grad_norm": 3.057833433151245, |
| "learning_rate": 4.9073314464834844e-05, |
| "loss": 0.4313, |
| "num_input_tokens_seen": 246848, |
| "step": 1910 |
| }, |
| { |
| "epoch": 1.788048552754435, |
| "grad_norm": 17.901315689086914, |
| "learning_rate": 4.906229303325961e-05, |
| "loss": 0.6497, |
| "num_input_tokens_seen": 247472, |
| "step": 1915 |
| }, |
| { |
| "epoch": 1.7927170868347337, |
| "grad_norm": 4.0249810218811035, |
| "learning_rate": 4.905120769994072e-05, |
| "loss": 0.6531, |
| "num_input_tokens_seen": 248128, |
| "step": 1920 |
| }, |
| { |
| "epoch": 1.7973856209150327, |
| "grad_norm": 0.5758776068687439, |
| "learning_rate": 4.9040058494317244e-05, |
| "loss": 0.3771, |
| "num_input_tokens_seen": 248864, |
| "step": 1925 |
| }, |
| { |
| "epoch": 1.8020541549953315, |
| "grad_norm": 10.201847076416016, |
| "learning_rate": 4.902884544599792e-05, |
| "loss": 0.702, |
| "num_input_tokens_seen": 249616, |
| "step": 1930 |
| }, |
| { |
| "epoch": 1.8067226890756303, |
| "grad_norm": 4.930323600769043, |
| "learning_rate": 4.901756858476101e-05, |
| "loss": 0.7461, |
| "num_input_tokens_seen": 250256, |
| "step": 1935 |
| }, |
| { |
| "epoch": 1.811391223155929, |
| "grad_norm": 4.371157646179199, |
| "learning_rate": 4.900622794055424e-05, |
| "loss": 0.8322, |
| "num_input_tokens_seen": 250864, |
| "step": 1940 |
| }, |
| { |
| "epoch": 1.8160597572362278, |
| "grad_norm": 3.946460485458374, |
| "learning_rate": 4.899482354349473e-05, |
| "loss": 0.8725, |
| "num_input_tokens_seen": 251568, |
| "step": 1945 |
| }, |
| { |
| "epoch": 1.8207282913165266, |
| "grad_norm": 9.607342720031738, |
| "learning_rate": 4.8983355423868913e-05, |
| "loss": 1.0061, |
| "num_input_tokens_seen": 252192, |
| "step": 1950 |
| }, |
| { |
| "epoch": 1.8253968253968254, |
| "grad_norm": 3.2483971118927, |
| "learning_rate": 4.8971823612132436e-05, |
| "loss": 0.637, |
| "num_input_tokens_seen": 252832, |
| "step": 1955 |
| }, |
| { |
| "epoch": 1.8300653594771243, |
| "grad_norm": 3.698613166809082, |
| "learning_rate": 4.8960228138910106e-05, |
| "loss": 0.4496, |
| "num_input_tokens_seen": 253488, |
| "step": 1960 |
| }, |
| { |
| "epoch": 1.8347338935574231, |
| "grad_norm": 7.103636741638184, |
| "learning_rate": 4.8948569034995765e-05, |
| "loss": 0.6317, |
| "num_input_tokens_seen": 254176, |
| "step": 1965 |
| }, |
| { |
| "epoch": 1.8394024276377219, |
| "grad_norm": 8.10537052154541, |
| "learning_rate": 4.8936846331352284e-05, |
| "loss": 0.8247, |
| "num_input_tokens_seen": 254752, |
| "step": 1970 |
| }, |
| { |
| "epoch": 1.8440709617180207, |
| "grad_norm": 5.597965717315674, |
| "learning_rate": 4.8925060059111394e-05, |
| "loss": 0.7657, |
| "num_input_tokens_seen": 255328, |
| "step": 1975 |
| }, |
| { |
| "epoch": 1.8487394957983194, |
| "grad_norm": 3.3380625247955322, |
| "learning_rate": 4.891321024957366e-05, |
| "loss": 0.6026, |
| "num_input_tokens_seen": 255936, |
| "step": 1980 |
| }, |
| { |
| "epoch": 1.8534080298786182, |
| "grad_norm": 3.575791835784912, |
| "learning_rate": 4.890129693420839e-05, |
| "loss": 0.7018, |
| "num_input_tokens_seen": 256608, |
| "step": 1985 |
| }, |
| { |
| "epoch": 1.858076563958917, |
| "grad_norm": 5.416692733764648, |
| "learning_rate": 4.888932014465352e-05, |
| "loss": 0.4166, |
| "num_input_tokens_seen": 257216, |
| "step": 1990 |
| }, |
| { |
| "epoch": 1.8627450980392157, |
| "grad_norm": 9.988777160644531, |
| "learning_rate": 4.887727991271558e-05, |
| "loss": 0.6218, |
| "num_input_tokens_seen": 257952, |
| "step": 1995 |
| }, |
| { |
| "epoch": 1.8674136321195145, |
| "grad_norm": 6.201949596405029, |
| "learning_rate": 4.8865176270369565e-05, |
| "loss": 0.8124, |
| "num_input_tokens_seen": 258608, |
| "step": 2000 |
| }, |
| { |
| "epoch": 1.8720821661998133, |
| "grad_norm": 4.34901237487793, |
| "learning_rate": 4.885300924975887e-05, |
| "loss": 0.418, |
| "num_input_tokens_seen": 259264, |
| "step": 2005 |
| }, |
| { |
| "epoch": 1.876750700280112, |
| "grad_norm": 7.196844577789307, |
| "learning_rate": 4.884077888319522e-05, |
| "loss": 0.7026, |
| "num_input_tokens_seen": 259936, |
| "step": 2010 |
| }, |
| { |
| "epoch": 1.8814192343604108, |
| "grad_norm": 4.676304340362549, |
| "learning_rate": 4.882848520315852e-05, |
| "loss": 0.7609, |
| "num_input_tokens_seen": 260560, |
| "step": 2015 |
| }, |
| { |
| "epoch": 1.8860877684407096, |
| "grad_norm": 6.854541301727295, |
| "learning_rate": 4.8816128242296876e-05, |
| "loss": 0.495, |
| "num_input_tokens_seen": 261184, |
| "step": 2020 |
| }, |
| { |
| "epoch": 1.8907563025210083, |
| "grad_norm": 3.158414840698242, |
| "learning_rate": 4.8803708033426404e-05, |
| "loss": 0.5712, |
| "num_input_tokens_seen": 261808, |
| "step": 2025 |
| }, |
| { |
| "epoch": 1.8954248366013071, |
| "grad_norm": 11.574291229248047, |
| "learning_rate": 4.8791224609531204e-05, |
| "loss": 0.4681, |
| "num_input_tokens_seen": 262384, |
| "step": 2030 |
| }, |
| { |
| "epoch": 1.9000933706816059, |
| "grad_norm": 3.723761558532715, |
| "learning_rate": 4.877867800376325e-05, |
| "loss": 0.5733, |
| "num_input_tokens_seen": 263008, |
| "step": 2035 |
| }, |
| { |
| "epoch": 1.9047619047619047, |
| "grad_norm": 3.9397077560424805, |
| "learning_rate": 4.8766068249442326e-05, |
| "loss": 1.066, |
| "num_input_tokens_seen": 263664, |
| "step": 2040 |
| }, |
| { |
| "epoch": 1.9094304388422034, |
| "grad_norm": 4.511308670043945, |
| "learning_rate": 4.875339538005588e-05, |
| "loss": 1.0934, |
| "num_input_tokens_seen": 264320, |
| "step": 2045 |
| }, |
| { |
| "epoch": 1.9140989729225022, |
| "grad_norm": 7.311952590942383, |
| "learning_rate": 4.874065942925899e-05, |
| "loss": 0.4114, |
| "num_input_tokens_seen": 265072, |
| "step": 2050 |
| }, |
| { |
| "epoch": 1.918767507002801, |
| "grad_norm": 14.041531562805176, |
| "learning_rate": 4.8727860430874285e-05, |
| "loss": 0.5461, |
| "num_input_tokens_seen": 265680, |
| "step": 2055 |
| }, |
| { |
| "epoch": 1.9234360410831, |
| "grad_norm": 2.48492431640625, |
| "learning_rate": 4.871499841889179e-05, |
| "loss": 0.9028, |
| "num_input_tokens_seen": 266384, |
| "step": 2060 |
| }, |
| { |
| "epoch": 1.9281045751633987, |
| "grad_norm": 5.875805854797363, |
| "learning_rate": 4.870207342746889e-05, |
| "loss": 0.5761, |
| "num_input_tokens_seen": 267040, |
| "step": 2065 |
| }, |
| { |
| "epoch": 1.9327731092436975, |
| "grad_norm": 5.392336845397949, |
| "learning_rate": 4.868908549093022e-05, |
| "loss": 0.624, |
| "num_input_tokens_seen": 267712, |
| "step": 2070 |
| }, |
| { |
| "epoch": 1.9374416433239963, |
| "grad_norm": 6.086864948272705, |
| "learning_rate": 4.867603464376759e-05, |
| "loss": 0.4388, |
| "num_input_tokens_seen": 268320, |
| "step": 2075 |
| }, |
| { |
| "epoch": 1.942110177404295, |
| "grad_norm": 12.777872085571289, |
| "learning_rate": 4.8662920920639866e-05, |
| "loss": 0.9118, |
| "num_input_tokens_seen": 268880, |
| "step": 2080 |
| }, |
| { |
| "epoch": 1.9467787114845938, |
| "grad_norm": 5.037237167358398, |
| "learning_rate": 4.864974435637289e-05, |
| "loss": 0.5265, |
| "num_input_tokens_seen": 269552, |
| "step": 2085 |
| }, |
| { |
| "epoch": 1.9514472455648926, |
| "grad_norm": 7.752954006195068, |
| "learning_rate": 4.863650498595941e-05, |
| "loss": 0.4638, |
| "num_input_tokens_seen": 270144, |
| "step": 2090 |
| }, |
| { |
| "epoch": 1.9561157796451916, |
| "grad_norm": 2.958127021789551, |
| "learning_rate": 4.862320284455894e-05, |
| "loss": 0.4215, |
| "num_input_tokens_seen": 270800, |
| "step": 2095 |
| }, |
| { |
| "epoch": 1.9607843137254903, |
| "grad_norm": 15.195981979370117, |
| "learning_rate": 4.860983796749771e-05, |
| "loss": 0.811, |
| "num_input_tokens_seen": 271456, |
| "step": 2100 |
| }, |
| { |
| "epoch": 1.965452847805789, |
| "grad_norm": 1.6306458711624146, |
| "learning_rate": 4.859641039026856e-05, |
| "loss": 0.6108, |
| "num_input_tokens_seen": 272016, |
| "step": 2105 |
| }, |
| { |
| "epoch": 1.9701213818860879, |
| "grad_norm": 5.165258884429932, |
| "learning_rate": 4.858292014853083e-05, |
| "loss": 0.713, |
| "num_input_tokens_seen": 272544, |
| "step": 2110 |
| }, |
| { |
| "epoch": 1.9747899159663866, |
| "grad_norm": 4.176558971405029, |
| "learning_rate": 4.8569367278110284e-05, |
| "loss": 0.7166, |
| "num_input_tokens_seen": 273216, |
| "step": 2115 |
| }, |
| { |
| "epoch": 1.9794584500466854, |
| "grad_norm": 5.415503025054932, |
| "learning_rate": 4.8555751814998994e-05, |
| "loss": 0.5213, |
| "num_input_tokens_seen": 273856, |
| "step": 2120 |
| }, |
| { |
| "epoch": 1.9841269841269842, |
| "grad_norm": 5.2978105545043945, |
| "learning_rate": 4.8542073795355294e-05, |
| "loss": 0.7801, |
| "num_input_tokens_seen": 274496, |
| "step": 2125 |
| }, |
| { |
| "epoch": 1.988795518207283, |
| "grad_norm": 15.461605072021484, |
| "learning_rate": 4.85283332555036e-05, |
| "loss": 0.6693, |
| "num_input_tokens_seen": 275072, |
| "step": 2130 |
| }, |
| { |
| "epoch": 1.9934640522875817, |
| "grad_norm": 4.298515319824219, |
| "learning_rate": 4.8514530231934385e-05, |
| "loss": 0.8724, |
| "num_input_tokens_seen": 275744, |
| "step": 2135 |
| }, |
| { |
| "epoch": 1.9981325863678805, |
| "grad_norm": 9.890350341796875, |
| "learning_rate": 4.850066476130407e-05, |
| "loss": 0.4097, |
| "num_input_tokens_seen": 276368, |
| "step": 2140 |
| }, |
| { |
| "epoch": 2.0018674136321195, |
| "eval_loss": 0.6239609122276306, |
| "eval_runtime": 3.8759, |
| "eval_samples_per_second": 61.405, |
| "eval_steps_per_second": 30.702, |
| "num_input_tokens_seen": 276856, |
| "step": 2144 |
| }, |
| { |
| "epoch": 2.0028011204481793, |
| "grad_norm": 5.202558994293213, |
| "learning_rate": 4.84867368804349e-05, |
| "loss": 0.4272, |
| "num_input_tokens_seen": 276984, |
| "step": 2145 |
| }, |
| { |
| "epoch": 2.007469654528478, |
| "grad_norm": 5.1944732666015625, |
| "learning_rate": 4.847274662631487e-05, |
| "loss": 0.6998, |
| "num_input_tokens_seen": 277544, |
| "step": 2150 |
| }, |
| { |
| "epoch": 2.012138188608777, |
| "grad_norm": 6.575268268585205, |
| "learning_rate": 4.8458694036097604e-05, |
| "loss": 0.5162, |
| "num_input_tokens_seen": 278264, |
| "step": 2155 |
| }, |
| { |
| "epoch": 2.0168067226890756, |
| "grad_norm": 4.267121315002441, |
| "learning_rate": 4.84445791471023e-05, |
| "loss": 0.5913, |
| "num_input_tokens_seen": 278888, |
| "step": 2160 |
| }, |
| { |
| "epoch": 2.0214752567693743, |
| "grad_norm": 5.02635383605957, |
| "learning_rate": 4.843040199681356e-05, |
| "loss": 0.8597, |
| "num_input_tokens_seen": 279576, |
| "step": 2165 |
| }, |
| { |
| "epoch": 2.026143790849673, |
| "grad_norm": 8.290732383728027, |
| "learning_rate": 4.8416162622881367e-05, |
| "loss": 0.396, |
| "num_input_tokens_seen": 280216, |
| "step": 2170 |
| }, |
| { |
| "epoch": 2.030812324929972, |
| "grad_norm": 3.7890634536743164, |
| "learning_rate": 4.840186106312094e-05, |
| "loss": 0.4928, |
| "num_input_tokens_seen": 280936, |
| "step": 2175 |
| }, |
| { |
| "epoch": 2.0354808590102706, |
| "grad_norm": 5.653308391571045, |
| "learning_rate": 4.8387497355512625e-05, |
| "loss": 0.4655, |
| "num_input_tokens_seen": 281560, |
| "step": 2180 |
| }, |
| { |
| "epoch": 2.0401493930905694, |
| "grad_norm": 13.255468368530273, |
| "learning_rate": 4.837307153820184e-05, |
| "loss": 0.8693, |
| "num_input_tokens_seen": 282200, |
| "step": 2185 |
| }, |
| { |
| "epoch": 2.044817927170868, |
| "grad_norm": 8.845050811767578, |
| "learning_rate": 4.835858364949894e-05, |
| "loss": 0.4014, |
| "num_input_tokens_seen": 282840, |
| "step": 2190 |
| }, |
| { |
| "epoch": 2.049486461251167, |
| "grad_norm": 2.6028687953948975, |
| "learning_rate": 4.834403372787912e-05, |
| "loss": 0.4387, |
| "num_input_tokens_seen": 283496, |
| "step": 2195 |
| }, |
| { |
| "epoch": 2.0541549953314657, |
| "grad_norm": 19.733068466186523, |
| "learning_rate": 4.83294218119823e-05, |
| "loss": 0.7063, |
| "num_input_tokens_seen": 284136, |
| "step": 2200 |
| }, |
| { |
| "epoch": 2.0588235294117645, |
| "grad_norm": 4.527667999267578, |
| "learning_rate": 4.831474794061305e-05, |
| "loss": 0.694, |
| "num_input_tokens_seen": 284744, |
| "step": 2205 |
| }, |
| { |
| "epoch": 2.0634920634920633, |
| "grad_norm": 1.302569031715393, |
| "learning_rate": 4.830001215274048e-05, |
| "loss": 0.2824, |
| "num_input_tokens_seen": 285352, |
| "step": 2210 |
| }, |
| { |
| "epoch": 2.0681605975723625, |
| "grad_norm": 6.566271781921387, |
| "learning_rate": 4.828521448749812e-05, |
| "loss": 0.8398, |
| "num_input_tokens_seen": 285992, |
| "step": 2215 |
| }, |
| { |
| "epoch": 2.0728291316526612, |
| "grad_norm": 5.239344120025635, |
| "learning_rate": 4.827035498418382e-05, |
| "loss": 0.6146, |
| "num_input_tokens_seen": 286616, |
| "step": 2220 |
| }, |
| { |
| "epoch": 2.07749766573296, |
| "grad_norm": 3.5977797508239746, |
| "learning_rate": 4.8255433682259685e-05, |
| "loss": 0.4369, |
| "num_input_tokens_seen": 287256, |
| "step": 2225 |
| }, |
| { |
| "epoch": 2.082166199813259, |
| "grad_norm": 7.737928867340088, |
| "learning_rate": 4.824045062135189e-05, |
| "loss": 0.6933, |
| "num_input_tokens_seen": 287864, |
| "step": 2230 |
| }, |
| { |
| "epoch": 2.0868347338935576, |
| "grad_norm": 8.302957534790039, |
| "learning_rate": 4.822540584125066e-05, |
| "loss": 0.4181, |
| "num_input_tokens_seen": 288456, |
| "step": 2235 |
| }, |
| { |
| "epoch": 2.0915032679738563, |
| "grad_norm": 2.4553043842315674, |
| "learning_rate": 4.82102993819101e-05, |
| "loss": 0.4067, |
| "num_input_tokens_seen": 289128, |
| "step": 2240 |
| }, |
| { |
| "epoch": 2.096171802054155, |
| "grad_norm": 4.2813544273376465, |
| "learning_rate": 4.819513128344814e-05, |
| "loss": 0.5259, |
| "num_input_tokens_seen": 289704, |
| "step": 2245 |
| }, |
| { |
| "epoch": 2.100840336134454, |
| "grad_norm": 2.3208742141723633, |
| "learning_rate": 4.8179901586146385e-05, |
| "loss": 0.4714, |
| "num_input_tokens_seen": 290360, |
| "step": 2250 |
| }, |
| { |
| "epoch": 2.1055088702147526, |
| "grad_norm": 8.080585479736328, |
| "learning_rate": 4.816461033045004e-05, |
| "loss": 0.6754, |
| "num_input_tokens_seen": 291080, |
| "step": 2255 |
| }, |
| { |
| "epoch": 2.1101774042950514, |
| "grad_norm": 2.4208004474639893, |
| "learning_rate": 4.8149257556967774e-05, |
| "loss": 0.28, |
| "num_input_tokens_seen": 291752, |
| "step": 2260 |
| }, |
| { |
| "epoch": 2.11484593837535, |
| "grad_norm": 3.9821560382843018, |
| "learning_rate": 4.813384330647164e-05, |
| "loss": 0.4721, |
| "num_input_tokens_seen": 292408, |
| "step": 2265 |
| }, |
| { |
| "epoch": 2.119514472455649, |
| "grad_norm": 17.761463165283203, |
| "learning_rate": 4.8118367619896956e-05, |
| "loss": 0.5283, |
| "num_input_tokens_seen": 293112, |
| "step": 2270 |
| }, |
| { |
| "epoch": 2.1241830065359477, |
| "grad_norm": 3.123309373855591, |
| "learning_rate": 4.8102830538342176e-05, |
| "loss": 0.4104, |
| "num_input_tokens_seen": 293768, |
| "step": 2275 |
| }, |
| { |
| "epoch": 2.1288515406162465, |
| "grad_norm": 4.399372100830078, |
| "learning_rate": 4.808723210306882e-05, |
| "loss": 0.3421, |
| "num_input_tokens_seen": 294472, |
| "step": 2280 |
| }, |
| { |
| "epoch": 2.1335200746965453, |
| "grad_norm": 13.870634078979492, |
| "learning_rate": 4.807157235550134e-05, |
| "loss": 0.4928, |
| "num_input_tokens_seen": 295240, |
| "step": 2285 |
| }, |
| { |
| "epoch": 2.138188608776844, |
| "grad_norm": 9.316656112670898, |
| "learning_rate": 4.8055851337227006e-05, |
| "loss": 0.3461, |
| "num_input_tokens_seen": 295880, |
| "step": 2290 |
| }, |
| { |
| "epoch": 2.142857142857143, |
| "grad_norm": 6.0931596755981445, |
| "learning_rate": 4.804006908999581e-05, |
| "loss": 0.7741, |
| "num_input_tokens_seen": 296568, |
| "step": 2295 |
| }, |
| { |
| "epoch": 2.1475256769374416, |
| "grad_norm": 3.882946729660034, |
| "learning_rate": 4.802422565572034e-05, |
| "loss": 0.3851, |
| "num_input_tokens_seen": 297208, |
| "step": 2300 |
| }, |
| { |
| "epoch": 2.1521942110177403, |
| "grad_norm": 4.334148406982422, |
| "learning_rate": 4.8008321076475694e-05, |
| "loss": 0.481, |
| "num_input_tokens_seen": 297976, |
| "step": 2305 |
| }, |
| { |
| "epoch": 2.156862745098039, |
| "grad_norm": 8.217869758605957, |
| "learning_rate": 4.799235539449932e-05, |
| "loss": 0.6381, |
| "num_input_tokens_seen": 298552, |
| "step": 2310 |
| }, |
| { |
| "epoch": 2.161531279178338, |
| "grad_norm": 12.515053749084473, |
| "learning_rate": 4.797632865219098e-05, |
| "loss": 0.7755, |
| "num_input_tokens_seen": 299128, |
| "step": 2315 |
| }, |
| { |
| "epoch": 2.1661998132586366, |
| "grad_norm": 6.713847637176514, |
| "learning_rate": 4.7960240892112554e-05, |
| "loss": 0.4644, |
| "num_input_tokens_seen": 299784, |
| "step": 2320 |
| }, |
| { |
| "epoch": 2.1708683473389354, |
| "grad_norm": 1.149377465248108, |
| "learning_rate": 4.794409215698799e-05, |
| "loss": 0.2835, |
| "num_input_tokens_seen": 300408, |
| "step": 2325 |
| }, |
| { |
| "epoch": 2.175536881419234, |
| "grad_norm": 6.214423179626465, |
| "learning_rate": 4.792788248970314e-05, |
| "loss": 0.5038, |
| "num_input_tokens_seen": 301064, |
| "step": 2330 |
| }, |
| { |
| "epoch": 2.180205415499533, |
| "grad_norm": 1.930323600769043, |
| "learning_rate": 4.7911611933305707e-05, |
| "loss": 0.2548, |
| "num_input_tokens_seen": 301736, |
| "step": 2335 |
| }, |
| { |
| "epoch": 2.184873949579832, |
| "grad_norm": 7.587453842163086, |
| "learning_rate": 4.7895280531005064e-05, |
| "loss": 0.5286, |
| "num_input_tokens_seen": 302280, |
| "step": 2340 |
| }, |
| { |
| "epoch": 2.189542483660131, |
| "grad_norm": 6.7828192710876465, |
| "learning_rate": 4.78788883261722e-05, |
| "loss": 0.4586, |
| "num_input_tokens_seen": 302872, |
| "step": 2345 |
| }, |
| { |
| "epoch": 2.1942110177404297, |
| "grad_norm": 15.733732223510742, |
| "learning_rate": 4.786243536233954e-05, |
| "loss": 0.9646, |
| "num_input_tokens_seen": 303464, |
| "step": 2350 |
| }, |
| { |
| "epoch": 2.1988795518207285, |
| "grad_norm": 2.523688554763794, |
| "learning_rate": 4.7845921683200905e-05, |
| "loss": 0.3255, |
| "num_input_tokens_seen": 304200, |
| "step": 2355 |
| }, |
| { |
| "epoch": 2.2035480859010272, |
| "grad_norm": 4.354966640472412, |
| "learning_rate": 4.782934733261133e-05, |
| "loss": 0.3044, |
| "num_input_tokens_seen": 304824, |
| "step": 2360 |
| }, |
| { |
| "epoch": 2.208216619981326, |
| "grad_norm": 2.6396069526672363, |
| "learning_rate": 4.781271235458699e-05, |
| "loss": 0.6225, |
| "num_input_tokens_seen": 305416, |
| "step": 2365 |
| }, |
| { |
| "epoch": 2.212885154061625, |
| "grad_norm": 11.276773452758789, |
| "learning_rate": 4.779601679330504e-05, |
| "loss": 0.6085, |
| "num_input_tokens_seen": 305992, |
| "step": 2370 |
| }, |
| { |
| "epoch": 2.2175536881419236, |
| "grad_norm": 2.625945568084717, |
| "learning_rate": 4.7779260693103556e-05, |
| "loss": 0.5244, |
| "num_input_tokens_seen": 306728, |
| "step": 2375 |
| }, |
| { |
| "epoch": 2.2222222222222223, |
| "grad_norm": 4.601404666900635, |
| "learning_rate": 4.776244409848138e-05, |
| "loss": 0.4817, |
| "num_input_tokens_seen": 307336, |
| "step": 2380 |
| }, |
| { |
| "epoch": 2.226890756302521, |
| "grad_norm": 10.985107421875, |
| "learning_rate": 4.774556705409799e-05, |
| "loss": 0.79, |
| "num_input_tokens_seen": 307960, |
| "step": 2385 |
| }, |
| { |
| "epoch": 2.23155929038282, |
| "grad_norm": 2.6938464641571045, |
| "learning_rate": 4.7728629604773415e-05, |
| "loss": 0.5564, |
| "num_input_tokens_seen": 308664, |
| "step": 2390 |
| }, |
| { |
| "epoch": 2.2362278244631186, |
| "grad_norm": 7.201427459716797, |
| "learning_rate": 4.7711631795488096e-05, |
| "loss": 0.2981, |
| "num_input_tokens_seen": 309320, |
| "step": 2395 |
| }, |
| { |
| "epoch": 2.2408963585434174, |
| "grad_norm": 1.2306467294692993, |
| "learning_rate": 4.769457367138277e-05, |
| "loss": 0.3292, |
| "num_input_tokens_seen": 309944, |
| "step": 2400 |
| }, |
| { |
| "epoch": 2.245564892623716, |
| "grad_norm": 4.115130424499512, |
| "learning_rate": 4.767745527775834e-05, |
| "loss": 0.8866, |
| "num_input_tokens_seen": 310568, |
| "step": 2405 |
| }, |
| { |
| "epoch": 2.250233426704015, |
| "grad_norm": 4.4345526695251465, |
| "learning_rate": 4.7660276660075804e-05, |
| "loss": 0.5751, |
| "num_input_tokens_seen": 311240, |
| "step": 2410 |
| }, |
| { |
| "epoch": 2.2549019607843137, |
| "grad_norm": 10.08226490020752, |
| "learning_rate": 4.764303786395604e-05, |
| "loss": 0.9322, |
| "num_input_tokens_seen": 311832, |
| "step": 2415 |
| }, |
| { |
| "epoch": 2.2595704948646125, |
| "grad_norm": 2.849057912826538, |
| "learning_rate": 4.7625738935179794e-05, |
| "loss": 0.5432, |
| "num_input_tokens_seen": 312504, |
| "step": 2420 |
| }, |
| { |
| "epoch": 2.2642390289449112, |
| "grad_norm": 8.79856014251709, |
| "learning_rate": 4.760837991968746e-05, |
| "loss": 0.4157, |
| "num_input_tokens_seen": 313160, |
| "step": 2425 |
| }, |
| { |
| "epoch": 2.26890756302521, |
| "grad_norm": 6.586191654205322, |
| "learning_rate": 4.7590960863579034e-05, |
| "loss": 0.4689, |
| "num_input_tokens_seen": 313768, |
| "step": 2430 |
| }, |
| { |
| "epoch": 2.273576097105509, |
| "grad_norm": 5.84085750579834, |
| "learning_rate": 4.757348181311394e-05, |
| "loss": 0.5085, |
| "num_input_tokens_seen": 314376, |
| "step": 2435 |
| }, |
| { |
| "epoch": 2.2782446311858076, |
| "grad_norm": 4.015573501586914, |
| "learning_rate": 4.7555942814710954e-05, |
| "loss": 0.491, |
| "num_input_tokens_seen": 315032, |
| "step": 2440 |
| }, |
| { |
| "epoch": 2.2829131652661063, |
| "grad_norm": 4.170641899108887, |
| "learning_rate": 4.7538343914948025e-05, |
| "loss": 0.4183, |
| "num_input_tokens_seen": 315656, |
| "step": 2445 |
| }, |
| { |
| "epoch": 2.287581699346405, |
| "grad_norm": 3.4333198070526123, |
| "learning_rate": 4.75206851605622e-05, |
| "loss": 0.386, |
| "num_input_tokens_seen": 316216, |
| "step": 2450 |
| }, |
| { |
| "epoch": 2.292250233426704, |
| "grad_norm": 4.160677433013916, |
| "learning_rate": 4.7502966598449475e-05, |
| "loss": 0.5694, |
| "num_input_tokens_seen": 316840, |
| "step": 2455 |
| }, |
| { |
| "epoch": 2.2969187675070026, |
| "grad_norm": 3.7964909076690674, |
| "learning_rate": 4.748518827566468e-05, |
| "loss": 0.4163, |
| "num_input_tokens_seen": 317448, |
| "step": 2460 |
| }, |
| { |
| "epoch": 2.3015873015873014, |
| "grad_norm": 7.525023937225342, |
| "learning_rate": 4.746735023942134e-05, |
| "loss": 0.4436, |
| "num_input_tokens_seen": 318056, |
| "step": 2465 |
| }, |
| { |
| "epoch": 2.3062558356676, |
| "grad_norm": 13.83182430267334, |
| "learning_rate": 4.744945253709156e-05, |
| "loss": 0.6373, |
| "num_input_tokens_seen": 318728, |
| "step": 2470 |
| }, |
| { |
| "epoch": 2.310924369747899, |
| "grad_norm": 1.9470397233963013, |
| "learning_rate": 4.743149521620591e-05, |
| "loss": 1.0598, |
| "num_input_tokens_seen": 319320, |
| "step": 2475 |
| }, |
| { |
| "epoch": 2.3155929038281977, |
| "grad_norm": 3.6188297271728516, |
| "learning_rate": 4.7413478324453296e-05, |
| "loss": 0.4637, |
| "num_input_tokens_seen": 319928, |
| "step": 2480 |
| }, |
| { |
| "epoch": 2.3202614379084965, |
| "grad_norm": 9.640084266662598, |
| "learning_rate": 4.7395401909680805e-05, |
| "loss": 0.7186, |
| "num_input_tokens_seen": 320456, |
| "step": 2485 |
| }, |
| { |
| "epoch": 2.3249299719887957, |
| "grad_norm": 4.796760082244873, |
| "learning_rate": 4.737726601989359e-05, |
| "loss": 0.2771, |
| "num_input_tokens_seen": 321192, |
| "step": 2490 |
| }, |
| { |
| "epoch": 2.3295985060690945, |
| "grad_norm": 3.5131568908691406, |
| "learning_rate": 4.735907070325478e-05, |
| "loss": 0.6796, |
| "num_input_tokens_seen": 321816, |
| "step": 2495 |
| }, |
| { |
| "epoch": 2.3342670401493932, |
| "grad_norm": 9.193059921264648, |
| "learning_rate": 4.734081600808531e-05, |
| "loss": 0.6376, |
| "num_input_tokens_seen": 322440, |
| "step": 2500 |
| }, |
| { |
| "epoch": 2.338935574229692, |
| "grad_norm": 2.209547758102417, |
| "learning_rate": 4.73225019828638e-05, |
| "loss": 0.3292, |
| "num_input_tokens_seen": 323128, |
| "step": 2505 |
| }, |
| { |
| "epoch": 2.3436041083099908, |
| "grad_norm": 9.847227096557617, |
| "learning_rate": 4.7304128676226426e-05, |
| "loss": 0.366, |
| "num_input_tokens_seen": 323832, |
| "step": 2510 |
| }, |
| { |
| "epoch": 2.3482726423902895, |
| "grad_norm": 2.2758820056915283, |
| "learning_rate": 4.728569613696683e-05, |
| "loss": 0.3458, |
| "num_input_tokens_seen": 324376, |
| "step": 2515 |
| }, |
| { |
| "epoch": 2.3529411764705883, |
| "grad_norm": 4.512838363647461, |
| "learning_rate": 4.72672044140359e-05, |
| "loss": 0.3957, |
| "num_input_tokens_seen": 325048, |
| "step": 2520 |
| }, |
| { |
| "epoch": 2.357609710550887, |
| "grad_norm": 13.972112655639648, |
| "learning_rate": 4.724865355654176e-05, |
| "loss": 0.745, |
| "num_input_tokens_seen": 325624, |
| "step": 2525 |
| }, |
| { |
| "epoch": 2.362278244631186, |
| "grad_norm": 1.9871619939804077, |
| "learning_rate": 4.723004361374953e-05, |
| "loss": 0.3958, |
| "num_input_tokens_seen": 326312, |
| "step": 2530 |
| }, |
| { |
| "epoch": 2.3669467787114846, |
| "grad_norm": 12.324496269226074, |
| "learning_rate": 4.7211374635081264e-05, |
| "loss": 0.4743, |
| "num_input_tokens_seen": 326968, |
| "step": 2535 |
| }, |
| { |
| "epoch": 2.3716153127917834, |
| "grad_norm": 10.762685775756836, |
| "learning_rate": 4.719264667011578e-05, |
| "loss": 0.5992, |
| "num_input_tokens_seen": 327512, |
| "step": 2540 |
| }, |
| { |
| "epoch": 2.376283846872082, |
| "grad_norm": 6.492682456970215, |
| "learning_rate": 4.717385976858857e-05, |
| "loss": 0.4598, |
| "num_input_tokens_seen": 328296, |
| "step": 2545 |
| }, |
| { |
| "epoch": 2.380952380952381, |
| "grad_norm": 14.06905746459961, |
| "learning_rate": 4.715501398039162e-05, |
| "loss": 0.5516, |
| "num_input_tokens_seen": 328808, |
| "step": 2550 |
| }, |
| { |
| "epoch": 2.3856209150326797, |
| "grad_norm": 6.21990966796875, |
| "learning_rate": 4.71361093555733e-05, |
| "loss": 0.7209, |
| "num_input_tokens_seen": 329496, |
| "step": 2555 |
| }, |
| { |
| "epoch": 2.3902894491129785, |
| "grad_norm": 5.462920188903809, |
| "learning_rate": 4.711714594433825e-05, |
| "loss": 0.362, |
| "num_input_tokens_seen": 330216, |
| "step": 2560 |
| }, |
| { |
| "epoch": 2.3949579831932772, |
| "grad_norm": 2.444115161895752, |
| "learning_rate": 4.7098123797047214e-05, |
| "loss": 1.1321, |
| "num_input_tokens_seen": 330840, |
| "step": 2565 |
| }, |
| { |
| "epoch": 2.399626517273576, |
| "grad_norm": 3.1685636043548584, |
| "learning_rate": 4.7079042964216916e-05, |
| "loss": 0.4157, |
| "num_input_tokens_seen": 331512, |
| "step": 2570 |
| }, |
| { |
| "epoch": 2.404295051353875, |
| "grad_norm": 4.733340263366699, |
| "learning_rate": 4.705990349651994e-05, |
| "loss": 0.6675, |
| "num_input_tokens_seen": 332232, |
| "step": 2575 |
| }, |
| { |
| "epoch": 2.4089635854341735, |
| "grad_norm": 6.854113578796387, |
| "learning_rate": 4.704070544478459e-05, |
| "loss": 0.4659, |
| "num_input_tokens_seen": 332872, |
| "step": 2580 |
| }, |
| { |
| "epoch": 2.4136321195144723, |
| "grad_norm": 7.259758949279785, |
| "learning_rate": 4.7021448859994735e-05, |
| "loss": 0.7414, |
| "num_input_tokens_seen": 333560, |
| "step": 2585 |
| }, |
| { |
| "epoch": 2.418300653594771, |
| "grad_norm": 5.3515424728393555, |
| "learning_rate": 4.70021337932897e-05, |
| "loss": 0.6076, |
| "num_input_tokens_seen": 334168, |
| "step": 2590 |
| }, |
| { |
| "epoch": 2.42296918767507, |
| "grad_norm": 19.960798263549805, |
| "learning_rate": 4.698276029596411e-05, |
| "loss": 0.8932, |
| "num_input_tokens_seen": 334856, |
| "step": 2595 |
| }, |
| { |
| "epoch": 2.4276377217553686, |
| "grad_norm": 4.575296401977539, |
| "learning_rate": 4.696332841946778e-05, |
| "loss": 0.3443, |
| "num_input_tokens_seen": 335560, |
| "step": 2600 |
| }, |
| { |
| "epoch": 2.432306255835668, |
| "grad_norm": 2.0907132625579834, |
| "learning_rate": 4.694383821540555e-05, |
| "loss": 0.358, |
| "num_input_tokens_seen": 336216, |
| "step": 2605 |
| }, |
| { |
| "epoch": 2.4369747899159666, |
| "grad_norm": 2.8520407676696777, |
| "learning_rate": 4.6924289735537144e-05, |
| "loss": 0.3586, |
| "num_input_tokens_seen": 336808, |
| "step": 2610 |
| }, |
| { |
| "epoch": 2.4416433239962654, |
| "grad_norm": 8.168237686157227, |
| "learning_rate": 4.690468303177706e-05, |
| "loss": 0.5266, |
| "num_input_tokens_seen": 337480, |
| "step": 2615 |
| }, |
| { |
| "epoch": 2.446311858076564, |
| "grad_norm": 2.263343334197998, |
| "learning_rate": 4.688501815619446e-05, |
| "loss": 0.5147, |
| "num_input_tokens_seen": 338136, |
| "step": 2620 |
| }, |
| { |
| "epoch": 2.450980392156863, |
| "grad_norm": 6.954434871673584, |
| "learning_rate": 4.6865295161012926e-05, |
| "loss": 0.5564, |
| "num_input_tokens_seen": 338808, |
| "step": 2625 |
| }, |
| { |
| "epoch": 2.4556489262371617, |
| "grad_norm": 2.3235628604888916, |
| "learning_rate": 4.684551409861042e-05, |
| "loss": 0.4085, |
| "num_input_tokens_seen": 339528, |
| "step": 2630 |
| }, |
| { |
| "epoch": 2.4603174603174605, |
| "grad_norm": 4.111285209655762, |
| "learning_rate": 4.682567502151911e-05, |
| "loss": 0.7339, |
| "num_input_tokens_seen": 340264, |
| "step": 2635 |
| }, |
| { |
| "epoch": 2.4649859943977592, |
| "grad_norm": 5.659359455108643, |
| "learning_rate": 4.680577798242523e-05, |
| "loss": 0.5562, |
| "num_input_tokens_seen": 340888, |
| "step": 2640 |
| }, |
| { |
| "epoch": 2.469654528478058, |
| "grad_norm": 1.6552449464797974, |
| "learning_rate": 4.6785823034168955e-05, |
| "loss": 0.3952, |
| "num_input_tokens_seen": 341640, |
| "step": 2645 |
| }, |
| { |
| "epoch": 2.4743230625583568, |
| "grad_norm": 1.5371763706207275, |
| "learning_rate": 4.676581022974421e-05, |
| "loss": 0.4896, |
| "num_input_tokens_seen": 342280, |
| "step": 2650 |
| }, |
| { |
| "epoch": 2.4789915966386555, |
| "grad_norm": 0.7637246251106262, |
| "learning_rate": 4.674573962229862e-05, |
| "loss": 0.8027, |
| "num_input_tokens_seen": 342872, |
| "step": 2655 |
| }, |
| { |
| "epoch": 2.4836601307189543, |
| "grad_norm": 7.288429260253906, |
| "learning_rate": 4.672561126513328e-05, |
| "loss": 0.5183, |
| "num_input_tokens_seen": 343496, |
| "step": 2660 |
| }, |
| { |
| "epoch": 2.488328664799253, |
| "grad_norm": 7.098439693450928, |
| "learning_rate": 4.670542521170266e-05, |
| "loss": 0.6193, |
| "num_input_tokens_seen": 344136, |
| "step": 2665 |
| }, |
| { |
| "epoch": 2.492997198879552, |
| "grad_norm": 4.773173809051514, |
| "learning_rate": 4.6685181515614454e-05, |
| "loss": 0.5955, |
| "num_input_tokens_seen": 344808, |
| "step": 2670 |
| }, |
| { |
| "epoch": 2.4976657329598506, |
| "grad_norm": 3.8115127086639404, |
| "learning_rate": 4.666488023062943e-05, |
| "loss": 0.3639, |
| "num_input_tokens_seen": 345400, |
| "step": 2675 |
| }, |
| { |
| "epoch": 2.5023342670401494, |
| "grad_norm": 7.138146877288818, |
| "learning_rate": 4.664452141066131e-05, |
| "loss": 0.6925, |
| "num_input_tokens_seen": 346040, |
| "step": 2680 |
| }, |
| { |
| "epoch": 2.5023342670401494, |
| "eval_loss": 0.652177631855011, |
| "eval_runtime": 3.8707, |
| "eval_samples_per_second": 61.488, |
| "eval_steps_per_second": 30.744, |
| "num_input_tokens_seen": 346040, |
| "step": 2680 |
| }, |
| { |
| "epoch": 2.507002801120448, |
| "grad_norm": 3.9293606281280518, |
| "learning_rate": 4.662410510977659e-05, |
| "loss": 0.6155, |
| "num_input_tokens_seen": 346616, |
| "step": 2685 |
| }, |
| { |
| "epoch": 2.511671335200747, |
| "grad_norm": 10.7706880569458, |
| "learning_rate": 4.6603631382194426e-05, |
| "loss": 0.8976, |
| "num_input_tokens_seen": 347192, |
| "step": 2690 |
| }, |
| { |
| "epoch": 2.5163398692810457, |
| "grad_norm": 5.461066246032715, |
| "learning_rate": 4.658310028228649e-05, |
| "loss": 0.4815, |
| "num_input_tokens_seen": 347768, |
| "step": 2695 |
| }, |
| { |
| "epoch": 2.5210084033613445, |
| "grad_norm": 2.074761152267456, |
| "learning_rate": 4.65625118645768e-05, |
| "loss": 0.1902, |
| "num_input_tokens_seen": 348360, |
| "step": 2700 |
| }, |
| { |
| "epoch": 2.5256769374416432, |
| "grad_norm": 3.5420331954956055, |
| "learning_rate": 4.654186618374159e-05, |
| "loss": 0.5325, |
| "num_input_tokens_seen": 348968, |
| "step": 2705 |
| }, |
| { |
| "epoch": 2.530345471521942, |
| "grad_norm": 0.8925294876098633, |
| "learning_rate": 4.6521163294609196e-05, |
| "loss": 0.695, |
| "num_input_tokens_seen": 349672, |
| "step": 2710 |
| }, |
| { |
| "epoch": 2.5350140056022408, |
| "grad_norm": 3.142289400100708, |
| "learning_rate": 4.650040325215985e-05, |
| "loss": 0.415, |
| "num_input_tokens_seen": 350328, |
| "step": 2715 |
| }, |
| { |
| "epoch": 2.5396825396825395, |
| "grad_norm": 8.260872840881348, |
| "learning_rate": 4.647958611152557e-05, |
| "loss": 0.59, |
| "num_input_tokens_seen": 350984, |
| "step": 2720 |
| }, |
| { |
| "epoch": 2.5443510737628383, |
| "grad_norm": 5.627845287322998, |
| "learning_rate": 4.645871192799004e-05, |
| "loss": 0.5929, |
| "num_input_tokens_seen": 351624, |
| "step": 2725 |
| }, |
| { |
| "epoch": 2.549019607843137, |
| "grad_norm": 6.843557834625244, |
| "learning_rate": 4.643778075698838e-05, |
| "loss": 0.4834, |
| "num_input_tokens_seen": 352264, |
| "step": 2730 |
| }, |
| { |
| "epoch": 2.553688141923436, |
| "grad_norm": 7.715879440307617, |
| "learning_rate": 4.6416792654107076e-05, |
| "loss": 0.7563, |
| "num_input_tokens_seen": 352920, |
| "step": 2735 |
| }, |
| { |
| "epoch": 2.5583566760037346, |
| "grad_norm": 4.674366474151611, |
| "learning_rate": 4.6395747675083825e-05, |
| "loss": 0.4375, |
| "num_input_tokens_seen": 353688, |
| "step": 2740 |
| }, |
| { |
| "epoch": 2.5630252100840334, |
| "grad_norm": 6.640374660491943, |
| "learning_rate": 4.637464587580734e-05, |
| "loss": 0.3506, |
| "num_input_tokens_seen": 354312, |
| "step": 2745 |
| }, |
| { |
| "epoch": 2.567693744164332, |
| "grad_norm": 2.6401913166046143, |
| "learning_rate": 4.6353487312317237e-05, |
| "loss": 0.6531, |
| "num_input_tokens_seen": 354888, |
| "step": 2750 |
| }, |
| { |
| "epoch": 2.572362278244631, |
| "grad_norm": 6.281817436218262, |
| "learning_rate": 4.6332272040803895e-05, |
| "loss": 0.4135, |
| "num_input_tokens_seen": 355480, |
| "step": 2755 |
| }, |
| { |
| "epoch": 2.5770308123249297, |
| "grad_norm": 4.7368388175964355, |
| "learning_rate": 4.631100011760827e-05, |
| "loss": 0.657, |
| "num_input_tokens_seen": 356072, |
| "step": 2760 |
| }, |
| { |
| "epoch": 2.581699346405229, |
| "grad_norm": 7.262160301208496, |
| "learning_rate": 4.628967159922178e-05, |
| "loss": 0.6438, |
| "num_input_tokens_seen": 356664, |
| "step": 2765 |
| }, |
| { |
| "epoch": 2.5863678804855277, |
| "grad_norm": 17.83985137939453, |
| "learning_rate": 4.626828654228615e-05, |
| "loss": 0.5256, |
| "num_input_tokens_seen": 357352, |
| "step": 2770 |
| }, |
| { |
| "epoch": 2.5910364145658265, |
| "grad_norm": 4.219756126403809, |
| "learning_rate": 4.624684500359323e-05, |
| "loss": 0.3764, |
| "num_input_tokens_seen": 357960, |
| "step": 2775 |
| }, |
| { |
| "epoch": 2.595704948646125, |
| "grad_norm": 4.930896282196045, |
| "learning_rate": 4.622534704008489e-05, |
| "loss": 1.1571, |
| "num_input_tokens_seen": 358584, |
| "step": 2780 |
| }, |
| { |
| "epoch": 2.600373482726424, |
| "grad_norm": 8.239065170288086, |
| "learning_rate": 4.620379270885282e-05, |
| "loss": 0.5915, |
| "num_input_tokens_seen": 359272, |
| "step": 2785 |
| }, |
| { |
| "epoch": 2.6050420168067228, |
| "grad_norm": 5.721827983856201, |
| "learning_rate": 4.6182182067138424e-05, |
| "loss": 0.4863, |
| "num_input_tokens_seen": 359992, |
| "step": 2790 |
| }, |
| { |
| "epoch": 2.6097105508870215, |
| "grad_norm": 5.4295654296875, |
| "learning_rate": 4.6160515172332655e-05, |
| "loss": 0.6015, |
| "num_input_tokens_seen": 360568, |
| "step": 2795 |
| }, |
| { |
| "epoch": 2.6143790849673203, |
| "grad_norm": 3.9932007789611816, |
| "learning_rate": 4.6138792081975846e-05, |
| "loss": 0.491, |
| "num_input_tokens_seen": 361272, |
| "step": 2800 |
| }, |
| { |
| "epoch": 2.619047619047619, |
| "grad_norm": 7.950539588928223, |
| "learning_rate": 4.611701285375756e-05, |
| "loss": 0.6307, |
| "num_input_tokens_seen": 361992, |
| "step": 2805 |
| }, |
| { |
| "epoch": 2.623716153127918, |
| "grad_norm": 9.549372673034668, |
| "learning_rate": 4.609517754551644e-05, |
| "loss": 0.8141, |
| "num_input_tokens_seen": 362584, |
| "step": 2810 |
| }, |
| { |
| "epoch": 2.6283846872082166, |
| "grad_norm": 4.924779415130615, |
| "learning_rate": 4.6073286215240105e-05, |
| "loss": 0.7201, |
| "num_input_tokens_seen": 363208, |
| "step": 2815 |
| }, |
| { |
| "epoch": 2.6330532212885154, |
| "grad_norm": 6.979094505310059, |
| "learning_rate": 4.605133892106488e-05, |
| "loss": 0.4521, |
| "num_input_tokens_seen": 363816, |
| "step": 2820 |
| }, |
| { |
| "epoch": 2.637721755368814, |
| "grad_norm": 16.25713348388672, |
| "learning_rate": 4.602933572127578e-05, |
| "loss": 0.5296, |
| "num_input_tokens_seen": 364504, |
| "step": 2825 |
| }, |
| { |
| "epoch": 2.642390289449113, |
| "grad_norm": 1.5559110641479492, |
| "learning_rate": 4.600727667430624e-05, |
| "loss": 0.499, |
| "num_input_tokens_seen": 365176, |
| "step": 2830 |
| }, |
| { |
| "epoch": 2.6470588235294117, |
| "grad_norm": 3.178032875061035, |
| "learning_rate": 4.598516183873802e-05, |
| "loss": 0.6472, |
| "num_input_tokens_seen": 365800, |
| "step": 2835 |
| }, |
| { |
| "epoch": 2.6517273576097105, |
| "grad_norm": 30.63277816772461, |
| "learning_rate": 4.596299127330106e-05, |
| "loss": 0.7986, |
| "num_input_tokens_seen": 366392, |
| "step": 2840 |
| }, |
| { |
| "epoch": 2.6563958916900092, |
| "grad_norm": 0.5623080134391785, |
| "learning_rate": 4.594076503687326e-05, |
| "loss": 0.5632, |
| "num_input_tokens_seen": 366984, |
| "step": 2845 |
| }, |
| { |
| "epoch": 2.661064425770308, |
| "grad_norm": 18.52434539794922, |
| "learning_rate": 4.591848318848039e-05, |
| "loss": 0.6165, |
| "num_input_tokens_seen": 367576, |
| "step": 2850 |
| }, |
| { |
| "epoch": 2.6657329598506068, |
| "grad_norm": 8.841620445251465, |
| "learning_rate": 4.589614578729591e-05, |
| "loss": 0.3239, |
| "num_input_tokens_seen": 368360, |
| "step": 2855 |
| }, |
| { |
| "epoch": 2.6704014939309055, |
| "grad_norm": 0.7548959851264954, |
| "learning_rate": 4.5873752892640796e-05, |
| "loss": 0.3458, |
| "num_input_tokens_seen": 369032, |
| "step": 2860 |
| }, |
| { |
| "epoch": 2.6750700280112047, |
| "grad_norm": 11.500990867614746, |
| "learning_rate": 4.5851304563983414e-05, |
| "loss": 0.6321, |
| "num_input_tokens_seen": 369688, |
| "step": 2865 |
| }, |
| { |
| "epoch": 2.6797385620915035, |
| "grad_norm": 1.3931317329406738, |
| "learning_rate": 4.582880086093933e-05, |
| "loss": 0.3262, |
| "num_input_tokens_seen": 370392, |
| "step": 2870 |
| }, |
| { |
| "epoch": 2.6844070961718023, |
| "grad_norm": 9.6282320022583, |
| "learning_rate": 4.5806241843271166e-05, |
| "loss": 0.678, |
| "num_input_tokens_seen": 370984, |
| "step": 2875 |
| }, |
| { |
| "epoch": 2.689075630252101, |
| "grad_norm": 2.316354990005493, |
| "learning_rate": 4.578362757088846e-05, |
| "loss": 0.4045, |
| "num_input_tokens_seen": 371720, |
| "step": 2880 |
| }, |
| { |
| "epoch": 2.6937441643324, |
| "grad_norm": 8.465997695922852, |
| "learning_rate": 4.5760958103847455e-05, |
| "loss": 0.4269, |
| "num_input_tokens_seen": 372392, |
| "step": 2885 |
| }, |
| { |
| "epoch": 2.6984126984126986, |
| "grad_norm": 1.1068583726882935, |
| "learning_rate": 4.573823350235102e-05, |
| "loss": 0.2869, |
| "num_input_tokens_seen": 373000, |
| "step": 2890 |
| }, |
| { |
| "epoch": 2.7030812324929974, |
| "grad_norm": 5.867659091949463, |
| "learning_rate": 4.57154538267484e-05, |
| "loss": 0.4467, |
| "num_input_tokens_seen": 373672, |
| "step": 2895 |
| }, |
| { |
| "epoch": 2.707749766573296, |
| "grad_norm": 2.448972702026367, |
| "learning_rate": 4.5692619137535134e-05, |
| "loss": 0.5245, |
| "num_input_tokens_seen": 374344, |
| "step": 2900 |
| }, |
| { |
| "epoch": 2.712418300653595, |
| "grad_norm": 7.908166885375977, |
| "learning_rate": 4.566972949535283e-05, |
| "loss": 0.5952, |
| "num_input_tokens_seen": 375000, |
| "step": 2905 |
| }, |
| { |
| "epoch": 2.7170868347338937, |
| "grad_norm": 2.1407413482666016, |
| "learning_rate": 4.5646784960989054e-05, |
| "loss": 0.6988, |
| "num_input_tokens_seen": 375560, |
| "step": 2910 |
| }, |
| { |
| "epoch": 2.7217553688141924, |
| "grad_norm": 2.4962363243103027, |
| "learning_rate": 4.562378559537714e-05, |
| "loss": 0.6708, |
| "num_input_tokens_seen": 376280, |
| "step": 2915 |
| }, |
| { |
| "epoch": 2.726423902894491, |
| "grad_norm": 10.618359565734863, |
| "learning_rate": 4.560073145959602e-05, |
| "loss": 0.5289, |
| "num_input_tokens_seen": 376968, |
| "step": 2920 |
| }, |
| { |
| "epoch": 2.73109243697479, |
| "grad_norm": 3.9154977798461914, |
| "learning_rate": 4.557762261487013e-05, |
| "loss": 0.9578, |
| "num_input_tokens_seen": 377592, |
| "step": 2925 |
| }, |
| { |
| "epoch": 2.7357609710550888, |
| "grad_norm": 5.653745651245117, |
| "learning_rate": 4.5554459122569124e-05, |
| "loss": 0.4755, |
| "num_input_tokens_seen": 378376, |
| "step": 2930 |
| }, |
| { |
| "epoch": 2.7404295051353875, |
| "grad_norm": 4.4318976402282715, |
| "learning_rate": 4.553124104420784e-05, |
| "loss": 0.3741, |
| "num_input_tokens_seen": 378968, |
| "step": 2935 |
| }, |
| { |
| "epoch": 2.7450980392156863, |
| "grad_norm": 2.4929981231689453, |
| "learning_rate": 4.550796844144605e-05, |
| "loss": 0.3582, |
| "num_input_tokens_seen": 379608, |
| "step": 2940 |
| }, |
| { |
| "epoch": 2.749766573295985, |
| "grad_norm": 4.1619486808776855, |
| "learning_rate": 4.548464137608834e-05, |
| "loss": 0.6926, |
| "num_input_tokens_seen": 380232, |
| "step": 2945 |
| }, |
| { |
| "epoch": 2.754435107376284, |
| "grad_norm": 2.7385575771331787, |
| "learning_rate": 4.546125991008392e-05, |
| "loss": 0.5283, |
| "num_input_tokens_seen": 380936, |
| "step": 2950 |
| }, |
| { |
| "epoch": 2.7591036414565826, |
| "grad_norm": 2.4300732612609863, |
| "learning_rate": 4.5437824105526474e-05, |
| "loss": 0.5301, |
| "num_input_tokens_seen": 381560, |
| "step": 2955 |
| }, |
| { |
| "epoch": 2.7637721755368814, |
| "grad_norm": 4.369988441467285, |
| "learning_rate": 4.541433402465399e-05, |
| "loss": 0.5472, |
| "num_input_tokens_seen": 382264, |
| "step": 2960 |
| }, |
| { |
| "epoch": 2.76844070961718, |
| "grad_norm": 3.6987788677215576, |
| "learning_rate": 4.5390789729848605e-05, |
| "loss": 0.6572, |
| "num_input_tokens_seen": 382936, |
| "step": 2965 |
| }, |
| { |
| "epoch": 2.773109243697479, |
| "grad_norm": 7.901442050933838, |
| "learning_rate": 4.5367191283636426e-05, |
| "loss": 0.6978, |
| "num_input_tokens_seen": 383528, |
| "step": 2970 |
| }, |
| { |
| "epoch": 2.7777777777777777, |
| "grad_norm": 12.086589813232422, |
| "learning_rate": 4.534353874868736e-05, |
| "loss": 0.4766, |
| "num_input_tokens_seen": 384104, |
| "step": 2975 |
| }, |
| { |
| "epoch": 2.7824463118580764, |
| "grad_norm": 4.757928848266602, |
| "learning_rate": 4.531983218781498e-05, |
| "loss": 0.5683, |
| "num_input_tokens_seen": 384664, |
| "step": 2980 |
| }, |
| { |
| "epoch": 2.787114845938375, |
| "grad_norm": 2.6184375286102295, |
| "learning_rate": 4.52960716639763e-05, |
| "loss": 0.4537, |
| "num_input_tokens_seen": 385288, |
| "step": 2985 |
| }, |
| { |
| "epoch": 2.791783380018674, |
| "grad_norm": 3.0726981163024902, |
| "learning_rate": 4.5272257240271676e-05, |
| "loss": 0.4201, |
| "num_input_tokens_seen": 385928, |
| "step": 2990 |
| }, |
| { |
| "epoch": 2.7964519140989728, |
| "grad_norm": 7.38121223449707, |
| "learning_rate": 4.524838897994458e-05, |
| "loss": 0.4057, |
| "num_input_tokens_seen": 386552, |
| "step": 2995 |
| }, |
| { |
| "epoch": 2.8011204481792715, |
| "grad_norm": 9.7579927444458, |
| "learning_rate": 4.5224466946381476e-05, |
| "loss": 0.6116, |
| "num_input_tokens_seen": 387144, |
| "step": 3000 |
| }, |
| { |
| "epoch": 2.8057889822595703, |
| "grad_norm": 2.8062477111816406, |
| "learning_rate": 4.520049120311162e-05, |
| "loss": 0.4844, |
| "num_input_tokens_seen": 387704, |
| "step": 3005 |
| }, |
| { |
| "epoch": 2.810457516339869, |
| "grad_norm": 5.818384647369385, |
| "learning_rate": 4.5176461813806904e-05, |
| "loss": 0.2765, |
| "num_input_tokens_seen": 388248, |
| "step": 3010 |
| }, |
| { |
| "epoch": 2.815126050420168, |
| "grad_norm": 3.2478232383728027, |
| "learning_rate": 4.5152378842281694e-05, |
| "loss": 0.5931, |
| "num_input_tokens_seen": 388952, |
| "step": 3015 |
| }, |
| { |
| "epoch": 2.8197945845004666, |
| "grad_norm": 1.4422049522399902, |
| "learning_rate": 4.512824235249265e-05, |
| "loss": 0.4764, |
| "num_input_tokens_seen": 389768, |
| "step": 3020 |
| }, |
| { |
| "epoch": 2.8244631185807654, |
| "grad_norm": 3.1220462322235107, |
| "learning_rate": 4.510405240853854e-05, |
| "loss": 0.6353, |
| "num_input_tokens_seen": 390408, |
| "step": 3025 |
| }, |
| { |
| "epoch": 2.8291316526610646, |
| "grad_norm": 5.443966865539551, |
| "learning_rate": 4.507980907466014e-05, |
| "loss": 0.6038, |
| "num_input_tokens_seen": 391112, |
| "step": 3030 |
| }, |
| { |
| "epoch": 2.8338001867413634, |
| "grad_norm": 10.735891342163086, |
| "learning_rate": 4.505551241523996e-05, |
| "loss": 0.4195, |
| "num_input_tokens_seen": 391752, |
| "step": 3035 |
| }, |
| { |
| "epoch": 2.838468720821662, |
| "grad_norm": 1.7903181314468384, |
| "learning_rate": 4.503116249480215e-05, |
| "loss": 0.5118, |
| "num_input_tokens_seen": 392456, |
| "step": 3040 |
| }, |
| { |
| "epoch": 2.843137254901961, |
| "grad_norm": 2.6636769771575928, |
| "learning_rate": 4.500675937801229e-05, |
| "loss": 0.4882, |
| "num_input_tokens_seen": 393112, |
| "step": 3045 |
| }, |
| { |
| "epoch": 2.8478057889822597, |
| "grad_norm": 6.658154487609863, |
| "learning_rate": 4.498230312967726e-05, |
| "loss": 0.4428, |
| "num_input_tokens_seen": 393768, |
| "step": 3050 |
| }, |
| { |
| "epoch": 2.8524743230625584, |
| "grad_norm": 4.922386169433594, |
| "learning_rate": 4.495779381474502e-05, |
| "loss": 0.5097, |
| "num_input_tokens_seen": 394424, |
| "step": 3055 |
| }, |
| { |
| "epoch": 2.857142857142857, |
| "grad_norm": 11.311652183532715, |
| "learning_rate": 4.4933231498304445e-05, |
| "loss": 0.6931, |
| "num_input_tokens_seen": 395096, |
| "step": 3060 |
| }, |
| { |
| "epoch": 2.861811391223156, |
| "grad_norm": 4.589713096618652, |
| "learning_rate": 4.490861624558519e-05, |
| "loss": 0.2915, |
| "num_input_tokens_seen": 395880, |
| "step": 3065 |
| }, |
| { |
| "epoch": 2.8664799253034547, |
| "grad_norm": 3.9051504135131836, |
| "learning_rate": 4.488394812195749e-05, |
| "loss": 0.5947, |
| "num_input_tokens_seen": 396520, |
| "step": 3070 |
| }, |
| { |
| "epoch": 2.8711484593837535, |
| "grad_norm": 5.779489517211914, |
| "learning_rate": 4.4859227192931974e-05, |
| "loss": 0.7912, |
| "num_input_tokens_seen": 397240, |
| "step": 3075 |
| }, |
| { |
| "epoch": 2.8758169934640523, |
| "grad_norm": 8.4837064743042, |
| "learning_rate": 4.483445352415951e-05, |
| "loss": 0.6806, |
| "num_input_tokens_seen": 397896, |
| "step": 3080 |
| }, |
| { |
| "epoch": 2.880485527544351, |
| "grad_norm": 6.633236408233643, |
| "learning_rate": 4.480962718143102e-05, |
| "loss": 1.0596, |
| "num_input_tokens_seen": 398504, |
| "step": 3085 |
| }, |
| { |
| "epoch": 2.88515406162465, |
| "grad_norm": 4.59561824798584, |
| "learning_rate": 4.4784748230677344e-05, |
| "loss": 0.7187, |
| "num_input_tokens_seen": 399192, |
| "step": 3090 |
| }, |
| { |
| "epoch": 2.8898225957049486, |
| "grad_norm": 2.5471694469451904, |
| "learning_rate": 4.475981673796899e-05, |
| "loss": 0.543, |
| "num_input_tokens_seen": 399848, |
| "step": 3095 |
| }, |
| { |
| "epoch": 2.8944911297852474, |
| "grad_norm": 5.1371893882751465, |
| "learning_rate": 4.4734832769516014e-05, |
| "loss": 0.8593, |
| "num_input_tokens_seen": 400392, |
| "step": 3100 |
| }, |
| { |
| "epoch": 2.899159663865546, |
| "grad_norm": 4.461278438568115, |
| "learning_rate": 4.470979639166784e-05, |
| "loss": 0.5156, |
| "num_input_tokens_seen": 401016, |
| "step": 3105 |
| }, |
| { |
| "epoch": 2.903828197945845, |
| "grad_norm": 3.752699851989746, |
| "learning_rate": 4.468470767091306e-05, |
| "loss": 0.7429, |
| "num_input_tokens_seen": 401608, |
| "step": 3110 |
| }, |
| { |
| "epoch": 2.9084967320261437, |
| "grad_norm": 7.799002170562744, |
| "learning_rate": 4.46595666738793e-05, |
| "loss": 0.4073, |
| "num_input_tokens_seen": 402328, |
| "step": 3115 |
| }, |
| { |
| "epoch": 2.9131652661064424, |
| "grad_norm": 1.570083498954773, |
| "learning_rate": 4.4634373467332994e-05, |
| "loss": 0.6496, |
| "num_input_tokens_seen": 402984, |
| "step": 3120 |
| }, |
| { |
| "epoch": 2.917833800186741, |
| "grad_norm": 1.5849401950836182, |
| "learning_rate": 4.46091281181792e-05, |
| "loss": 0.5304, |
| "num_input_tokens_seen": 403592, |
| "step": 3125 |
| }, |
| { |
| "epoch": 2.9225023342670404, |
| "grad_norm": 5.008321762084961, |
| "learning_rate": 4.458383069346152e-05, |
| "loss": 0.4337, |
| "num_input_tokens_seen": 404360, |
| "step": 3130 |
| }, |
| { |
| "epoch": 2.927170868347339, |
| "grad_norm": 2.1844277381896973, |
| "learning_rate": 4.4558481260361785e-05, |
| "loss": 0.4136, |
| "num_input_tokens_seen": 404984, |
| "step": 3135 |
| }, |
| { |
| "epoch": 2.931839402427638, |
| "grad_norm": 12.90602970123291, |
| "learning_rate": 4.453307988619997e-05, |
| "loss": 0.8935, |
| "num_input_tokens_seen": 405528, |
| "step": 3140 |
| }, |
| { |
| "epoch": 2.9365079365079367, |
| "grad_norm": 10.970650672912598, |
| "learning_rate": 4.4507626638434006e-05, |
| "loss": 0.3612, |
| "num_input_tokens_seen": 406104, |
| "step": 3145 |
| }, |
| { |
| "epoch": 2.9411764705882355, |
| "grad_norm": 5.149006366729736, |
| "learning_rate": 4.448212158465956e-05, |
| "loss": 0.2854, |
| "num_input_tokens_seen": 406712, |
| "step": 3150 |
| }, |
| { |
| "epoch": 2.9458450046685343, |
| "grad_norm": 4.595794200897217, |
| "learning_rate": 4.4456564792609886e-05, |
| "loss": 0.3998, |
| "num_input_tokens_seen": 407336, |
| "step": 3155 |
| }, |
| { |
| "epoch": 2.950513538748833, |
| "grad_norm": 3.2409141063690186, |
| "learning_rate": 4.4430956330155636e-05, |
| "loss": 0.4878, |
| "num_input_tokens_seen": 408008, |
| "step": 3160 |
| }, |
| { |
| "epoch": 2.955182072829132, |
| "grad_norm": 20.001317977905273, |
| "learning_rate": 4.440529626530469e-05, |
| "loss": 0.4073, |
| "num_input_tokens_seen": 408632, |
| "step": 3165 |
| }, |
| { |
| "epoch": 2.9598506069094306, |
| "grad_norm": 5.227000713348389, |
| "learning_rate": 4.4379584666201944e-05, |
| "loss": 0.9487, |
| "num_input_tokens_seen": 409240, |
| "step": 3170 |
| }, |
| { |
| "epoch": 2.9645191409897294, |
| "grad_norm": 8.687257766723633, |
| "learning_rate": 4.43538216011292e-05, |
| "loss": 0.6452, |
| "num_input_tokens_seen": 409912, |
| "step": 3175 |
| }, |
| { |
| "epoch": 2.969187675070028, |
| "grad_norm": 3.1073365211486816, |
| "learning_rate": 4.432800713850488e-05, |
| "loss": 0.6379, |
| "num_input_tokens_seen": 410632, |
| "step": 3180 |
| }, |
| { |
| "epoch": 2.973856209150327, |
| "grad_norm": 6.412373065948486, |
| "learning_rate": 4.430214134688394e-05, |
| "loss": 0.6797, |
| "num_input_tokens_seen": 411224, |
| "step": 3185 |
| }, |
| { |
| "epoch": 2.9785247432306257, |
| "grad_norm": 2.9643750190734863, |
| "learning_rate": 4.427622429495765e-05, |
| "loss": 0.6146, |
| "num_input_tokens_seen": 411848, |
| "step": 3190 |
| }, |
| { |
| "epoch": 2.9831932773109244, |
| "grad_norm": 15.312832832336426, |
| "learning_rate": 4.425025605155337e-05, |
| "loss": 0.48, |
| "num_input_tokens_seen": 412536, |
| "step": 3195 |
| }, |
| { |
| "epoch": 2.987861811391223, |
| "grad_norm": 6.596396446228027, |
| "learning_rate": 4.4224236685634466e-05, |
| "loss": 0.5964, |
| "num_input_tokens_seen": 413144, |
| "step": 3200 |
| }, |
| { |
| "epoch": 2.992530345471522, |
| "grad_norm": 3.3958661556243896, |
| "learning_rate": 4.419816626630003e-05, |
| "loss": 0.2579, |
| "num_input_tokens_seen": 413752, |
| "step": 3205 |
| }, |
| { |
| "epoch": 2.9971988795518207, |
| "grad_norm": 5.963001251220703, |
| "learning_rate": 4.417204486278475e-05, |
| "loss": 0.7093, |
| "num_input_tokens_seen": 414488, |
| "step": 3210 |
| }, |
| { |
| "epoch": 3.0018674136321195, |
| "grad_norm": 4.949418067932129, |
| "learning_rate": 4.414587254445869e-05, |
| "loss": 0.5218, |
| "num_input_tokens_seen": 415056, |
| "step": 3215 |
| }, |
| { |
| "epoch": 3.0028011204481793, |
| "eval_loss": 0.6635326147079468, |
| "eval_runtime": 3.8704, |
| "eval_samples_per_second": 61.493, |
| "eval_steps_per_second": 30.746, |
| "num_input_tokens_seen": 415184, |
| "step": 3216 |
| }, |
| { |
| "epoch": 3.0065359477124183, |
| "grad_norm": 4.510260105133057, |
| "learning_rate": 4.411964938082717e-05, |
| "loss": 0.4789, |
| "num_input_tokens_seen": 415712, |
| "step": 3220 |
| }, |
| { |
| "epoch": 3.011204481792717, |
| "grad_norm": 2.7946648597717285, |
| "learning_rate": 4.409337544153049e-05, |
| "loss": 0.4034, |
| "num_input_tokens_seen": 416320, |
| "step": 3225 |
| }, |
| { |
| "epoch": 3.015873015873016, |
| "grad_norm": 1.9573955535888672, |
| "learning_rate": 4.406705079634384e-05, |
| "loss": 0.2833, |
| "num_input_tokens_seen": 416992, |
| "step": 3230 |
| }, |
| { |
| "epoch": 3.0205415499533146, |
| "grad_norm": 4.682679653167725, |
| "learning_rate": 4.404067551517703e-05, |
| "loss": 0.2534, |
| "num_input_tokens_seen": 417728, |
| "step": 3235 |
| }, |
| { |
| "epoch": 3.0252100840336134, |
| "grad_norm": 7.025947570800781, |
| "learning_rate": 4.401424966807438e-05, |
| "loss": 0.558, |
| "num_input_tokens_seen": 418336, |
| "step": 3240 |
| }, |
| { |
| "epoch": 3.029878618113912, |
| "grad_norm": 3.119868278503418, |
| "learning_rate": 4.398777332521444e-05, |
| "loss": 0.4241, |
| "num_input_tokens_seen": 418928, |
| "step": 3245 |
| }, |
| { |
| "epoch": 3.034547152194211, |
| "grad_norm": 16.719175338745117, |
| "learning_rate": 4.3961246556909934e-05, |
| "loss": 0.2493, |
| "num_input_tokens_seen": 419472, |
| "step": 3250 |
| }, |
| { |
| "epoch": 3.0392156862745097, |
| "grad_norm": 1.7161365747451782, |
| "learning_rate": 4.393466943360745e-05, |
| "loss": 0.3051, |
| "num_input_tokens_seen": 420192, |
| "step": 3255 |
| }, |
| { |
| "epoch": 3.0438842203548084, |
| "grad_norm": 3.6444671154022217, |
| "learning_rate": 4.39080420258873e-05, |
| "loss": 0.5382, |
| "num_input_tokens_seen": 420864, |
| "step": 3260 |
| }, |
| { |
| "epoch": 3.048552754435107, |
| "grad_norm": 2.0355281829833984, |
| "learning_rate": 4.388136440446337e-05, |
| "loss": 0.3762, |
| "num_input_tokens_seen": 421408, |
| "step": 3265 |
| }, |
| { |
| "epoch": 3.053221288515406, |
| "grad_norm": 4.12183141708374, |
| "learning_rate": 4.385463664018288e-05, |
| "loss": 0.6386, |
| "num_input_tokens_seen": 422032, |
| "step": 3270 |
| }, |
| { |
| "epoch": 3.0578898225957047, |
| "grad_norm": 6.405667781829834, |
| "learning_rate": 4.382785880402619e-05, |
| "loss": 0.4338, |
| "num_input_tokens_seen": 422704, |
| "step": 3275 |
| }, |
| { |
| "epoch": 3.0625583566760035, |
| "grad_norm": 2.73160982131958, |
| "learning_rate": 4.3801030967106676e-05, |
| "loss": 0.2651, |
| "num_input_tokens_seen": 423328, |
| "step": 3280 |
| }, |
| { |
| "epoch": 3.0672268907563027, |
| "grad_norm": 11.54159164428711, |
| "learning_rate": 4.377415320067048e-05, |
| "loss": 0.2931, |
| "num_input_tokens_seen": 423920, |
| "step": 3285 |
| }, |
| { |
| "epoch": 3.0718954248366015, |
| "grad_norm": 6.4927144050598145, |
| "learning_rate": 4.374722557609633e-05, |
| "loss": 0.3357, |
| "num_input_tokens_seen": 424752, |
| "step": 3290 |
| }, |
| { |
| "epoch": 3.0765639589169003, |
| "grad_norm": 6.1774091720581055, |
| "learning_rate": 4.372024816489537e-05, |
| "loss": 0.4242, |
| "num_input_tokens_seen": 425456, |
| "step": 3295 |
| }, |
| { |
| "epoch": 3.081232492997199, |
| "grad_norm": 3.846402645111084, |
| "learning_rate": 4.3693221038710986e-05, |
| "loss": 0.3312, |
| "num_input_tokens_seen": 426096, |
| "step": 3300 |
| }, |
| { |
| "epoch": 3.085901027077498, |
| "grad_norm": 4.391523361206055, |
| "learning_rate": 4.366614426931855e-05, |
| "loss": 0.4697, |
| "num_input_tokens_seen": 426720, |
| "step": 3305 |
| }, |
| { |
| "epoch": 3.0905695611577966, |
| "grad_norm": 5.942804336547852, |
| "learning_rate": 4.363901792862529e-05, |
| "loss": 0.6869, |
| "num_input_tokens_seen": 427344, |
| "step": 3310 |
| }, |
| { |
| "epoch": 3.0952380952380953, |
| "grad_norm": 7.151164531707764, |
| "learning_rate": 4.361184208867009e-05, |
| "loss": 0.4796, |
| "num_input_tokens_seen": 427904, |
| "step": 3315 |
| }, |
| { |
| "epoch": 3.099906629318394, |
| "grad_norm": 8.739173889160156, |
| "learning_rate": 4.3584616821623267e-05, |
| "loss": 0.3204, |
| "num_input_tokens_seen": 428480, |
| "step": 3320 |
| }, |
| { |
| "epoch": 3.104575163398693, |
| "grad_norm": 1.7101725339889526, |
| "learning_rate": 4.3557342199786414e-05, |
| "loss": 0.5556, |
| "num_input_tokens_seen": 429104, |
| "step": 3325 |
| }, |
| { |
| "epoch": 3.1092436974789917, |
| "grad_norm": 5.500372886657715, |
| "learning_rate": 4.353001829559219e-05, |
| "loss": 0.5862, |
| "num_input_tokens_seen": 429824, |
| "step": 3330 |
| }, |
| { |
| "epoch": 3.1139122315592904, |
| "grad_norm": 4.01257848739624, |
| "learning_rate": 4.350264518160414e-05, |
| "loss": 0.3249, |
| "num_input_tokens_seen": 430528, |
| "step": 3335 |
| }, |
| { |
| "epoch": 3.118580765639589, |
| "grad_norm": 5.113812446594238, |
| "learning_rate": 4.347522293051648e-05, |
| "loss": 0.6022, |
| "num_input_tokens_seen": 431120, |
| "step": 3340 |
| }, |
| { |
| "epoch": 3.123249299719888, |
| "grad_norm": 3.563166618347168, |
| "learning_rate": 4.344775161515393e-05, |
| "loss": 0.2757, |
| "num_input_tokens_seen": 431744, |
| "step": 3345 |
| }, |
| { |
| "epoch": 3.1279178338001867, |
| "grad_norm": 5.309462547302246, |
| "learning_rate": 4.3420231308471496e-05, |
| "loss": 0.4937, |
| "num_input_tokens_seen": 432432, |
| "step": 3350 |
| }, |
| { |
| "epoch": 3.1325863678804855, |
| "grad_norm": 6.111654758453369, |
| "learning_rate": 4.3392662083554316e-05, |
| "loss": 0.301, |
| "num_input_tokens_seen": 433104, |
| "step": 3355 |
| }, |
| { |
| "epoch": 3.1372549019607843, |
| "grad_norm": 3.6707801818847656, |
| "learning_rate": 4.3365044013617406e-05, |
| "loss": 0.3583, |
| "num_input_tokens_seen": 433728, |
| "step": 3360 |
| }, |
| { |
| "epoch": 3.141923436041083, |
| "grad_norm": 2.1480159759521484, |
| "learning_rate": 4.3337377172005524e-05, |
| "loss": 0.5164, |
| "num_input_tokens_seen": 434384, |
| "step": 3365 |
| }, |
| { |
| "epoch": 3.146591970121382, |
| "grad_norm": 1.8540209531784058, |
| "learning_rate": 4.330966163219293e-05, |
| "loss": 0.2326, |
| "num_input_tokens_seen": 435056, |
| "step": 3370 |
| }, |
| { |
| "epoch": 3.1512605042016806, |
| "grad_norm": 13.281586647033691, |
| "learning_rate": 4.328189746778323e-05, |
| "loss": 0.3451, |
| "num_input_tokens_seen": 435680, |
| "step": 3375 |
| }, |
| { |
| "epoch": 3.1559290382819793, |
| "grad_norm": 4.929521083831787, |
| "learning_rate": 4.3254084752509145e-05, |
| "loss": 0.3595, |
| "num_input_tokens_seen": 436336, |
| "step": 3380 |
| }, |
| { |
| "epoch": 3.160597572362278, |
| "grad_norm": 1.3814646005630493, |
| "learning_rate": 4.322622356023235e-05, |
| "loss": 0.4323, |
| "num_input_tokens_seen": 437008, |
| "step": 3385 |
| }, |
| { |
| "epoch": 3.165266106442577, |
| "grad_norm": 11.279054641723633, |
| "learning_rate": 4.319831396494324e-05, |
| "loss": 0.4234, |
| "num_input_tokens_seen": 437744, |
| "step": 3390 |
| }, |
| { |
| "epoch": 3.1699346405228757, |
| "grad_norm": 9.408576965332031, |
| "learning_rate": 4.317035604076076e-05, |
| "loss": 0.5266, |
| "num_input_tokens_seen": 438384, |
| "step": 3395 |
| }, |
| { |
| "epoch": 3.1746031746031744, |
| "grad_norm": 4.971055507659912, |
| "learning_rate": 4.3142349861932205e-05, |
| "loss": 0.237, |
| "num_input_tokens_seen": 439072, |
| "step": 3400 |
| }, |
| { |
| "epoch": 3.179271708683473, |
| "grad_norm": 13.090262413024902, |
| "learning_rate": 4.3114295502833026e-05, |
| "loss": 0.4381, |
| "num_input_tokens_seen": 439680, |
| "step": 3405 |
| }, |
| { |
| "epoch": 3.1839402427637724, |
| "grad_norm": 3.374798059463501, |
| "learning_rate": 4.3086193037966593e-05, |
| "loss": 0.8413, |
| "num_input_tokens_seen": 440320, |
| "step": 3410 |
| }, |
| { |
| "epoch": 3.188608776844071, |
| "grad_norm": 2.8687775135040283, |
| "learning_rate": 4.305804254196407e-05, |
| "loss": 0.3212, |
| "num_input_tokens_seen": 440944, |
| "step": 3415 |
| }, |
| { |
| "epoch": 3.19327731092437, |
| "grad_norm": 7.667285919189453, |
| "learning_rate": 4.302984408958416e-05, |
| "loss": 0.4842, |
| "num_input_tokens_seen": 441520, |
| "step": 3420 |
| }, |
| { |
| "epoch": 3.1979458450046687, |
| "grad_norm": 0.6867608428001404, |
| "learning_rate": 4.3001597755712906e-05, |
| "loss": 0.3884, |
| "num_input_tokens_seen": 442176, |
| "step": 3425 |
| }, |
| { |
| "epoch": 3.2026143790849675, |
| "grad_norm": 5.781373977661133, |
| "learning_rate": 4.297330361536354e-05, |
| "loss": 0.3297, |
| "num_input_tokens_seen": 442864, |
| "step": 3430 |
| }, |
| { |
| "epoch": 3.2072829131652663, |
| "grad_norm": 3.4922213554382324, |
| "learning_rate": 4.294496174367623e-05, |
| "loss": 0.4167, |
| "num_input_tokens_seen": 443504, |
| "step": 3435 |
| }, |
| { |
| "epoch": 3.211951447245565, |
| "grad_norm": 6.011383056640625, |
| "learning_rate": 4.2916572215917906e-05, |
| "loss": 0.4975, |
| "num_input_tokens_seen": 444160, |
| "step": 3440 |
| }, |
| { |
| "epoch": 3.216619981325864, |
| "grad_norm": 4.055123805999756, |
| "learning_rate": 4.2888135107482067e-05, |
| "loss": 0.7297, |
| "num_input_tokens_seen": 444768, |
| "step": 3445 |
| }, |
| { |
| "epoch": 3.2212885154061626, |
| "grad_norm": 14.692686080932617, |
| "learning_rate": 4.2859650493888556e-05, |
| "loss": 0.7433, |
| "num_input_tokens_seen": 445424, |
| "step": 3450 |
| }, |
| { |
| "epoch": 3.2259570494864613, |
| "grad_norm": 14.178847312927246, |
| "learning_rate": 4.283111845078339e-05, |
| "loss": 0.5043, |
| "num_input_tokens_seen": 446160, |
| "step": 3455 |
| }, |
| { |
| "epoch": 3.23062558356676, |
| "grad_norm": 8.43553638458252, |
| "learning_rate": 4.280253905393855e-05, |
| "loss": 0.5195, |
| "num_input_tokens_seen": 446768, |
| "step": 3460 |
| }, |
| { |
| "epoch": 3.235294117647059, |
| "grad_norm": 7.715728759765625, |
| "learning_rate": 4.277391237925174e-05, |
| "loss": 0.4626, |
| "num_input_tokens_seen": 447376, |
| "step": 3465 |
| }, |
| { |
| "epoch": 3.2399626517273576, |
| "grad_norm": 3.9560585021972656, |
| "learning_rate": 4.274523850274625e-05, |
| "loss": 0.4005, |
| "num_input_tokens_seen": 447952, |
| "step": 3470 |
| }, |
| { |
| "epoch": 3.2446311858076564, |
| "grad_norm": 3.0296170711517334, |
| "learning_rate": 4.2716517500570705e-05, |
| "loss": 0.3344, |
| "num_input_tokens_seen": 448624, |
| "step": 3475 |
| }, |
| { |
| "epoch": 3.249299719887955, |
| "grad_norm": 6.199625492095947, |
| "learning_rate": 4.2687749448998906e-05, |
| "loss": 0.5182, |
| "num_input_tokens_seen": 449184, |
| "step": 3480 |
| }, |
| { |
| "epoch": 3.253968253968254, |
| "grad_norm": 7.57280969619751, |
| "learning_rate": 4.265893442442957e-05, |
| "loss": 0.7696, |
| "num_input_tokens_seen": 449856, |
| "step": 3485 |
| }, |
| { |
| "epoch": 3.2586367880485527, |
| "grad_norm": 5.486474990844727, |
| "learning_rate": 4.2630072503386165e-05, |
| "loss": 0.3264, |
| "num_input_tokens_seen": 450464, |
| "step": 3490 |
| }, |
| { |
| "epoch": 3.2633053221288515, |
| "grad_norm": 6.274860382080078, |
| "learning_rate": 4.260116376251672e-05, |
| "loss": 0.6919, |
| "num_input_tokens_seen": 451104, |
| "step": 3495 |
| }, |
| { |
| "epoch": 3.2679738562091503, |
| "grad_norm": 2.639403820037842, |
| "learning_rate": 4.2572208278593596e-05, |
| "loss": 0.3676, |
| "num_input_tokens_seen": 451840, |
| "step": 3500 |
| }, |
| { |
| "epoch": 3.272642390289449, |
| "grad_norm": 8.113277435302734, |
| "learning_rate": 4.254320612851328e-05, |
| "loss": 0.5073, |
| "num_input_tokens_seen": 452528, |
| "step": 3505 |
| }, |
| { |
| "epoch": 3.277310924369748, |
| "grad_norm": 2.9319417476654053, |
| "learning_rate": 4.2514157389296196e-05, |
| "loss": 0.1423, |
| "num_input_tokens_seen": 453136, |
| "step": 3510 |
| }, |
| { |
| "epoch": 3.2819794584500466, |
| "grad_norm": 2.199583053588867, |
| "learning_rate": 4.248506213808648e-05, |
| "loss": 0.2884, |
| "num_input_tokens_seen": 453728, |
| "step": 3515 |
| }, |
| { |
| "epoch": 3.2866479925303453, |
| "grad_norm": 5.822009563446045, |
| "learning_rate": 4.245592045215182e-05, |
| "loss": 0.1555, |
| "num_input_tokens_seen": 454384, |
| "step": 3520 |
| }, |
| { |
| "epoch": 3.291316526610644, |
| "grad_norm": 2.6942152976989746, |
| "learning_rate": 4.242673240888319e-05, |
| "loss": 0.2941, |
| "num_input_tokens_seen": 454960, |
| "step": 3525 |
| }, |
| { |
| "epoch": 3.295985060690943, |
| "grad_norm": 16.42827606201172, |
| "learning_rate": 4.239749808579468e-05, |
| "loss": 0.5326, |
| "num_input_tokens_seen": 455648, |
| "step": 3530 |
| }, |
| { |
| "epoch": 3.3006535947712417, |
| "grad_norm": 6.735487937927246, |
| "learning_rate": 4.2368217560523306e-05, |
| "loss": 0.3945, |
| "num_input_tokens_seen": 456320, |
| "step": 3535 |
| }, |
| { |
| "epoch": 3.3053221288515404, |
| "grad_norm": 4.459935188293457, |
| "learning_rate": 4.233889091082874e-05, |
| "loss": 0.1235, |
| "num_input_tokens_seen": 456912, |
| "step": 3540 |
| }, |
| { |
| "epoch": 3.309990662931839, |
| "grad_norm": 7.533689975738525, |
| "learning_rate": 4.230951821459319e-05, |
| "loss": 0.392, |
| "num_input_tokens_seen": 457568, |
| "step": 3545 |
| }, |
| { |
| "epoch": 3.314659197012138, |
| "grad_norm": 5.718842029571533, |
| "learning_rate": 4.228009954982112e-05, |
| "loss": 0.2334, |
| "num_input_tokens_seen": 458416, |
| "step": 3550 |
| }, |
| { |
| "epoch": 3.3193277310924367, |
| "grad_norm": 6.39036750793457, |
| "learning_rate": 4.2250634994639095e-05, |
| "loss": 0.5383, |
| "num_input_tokens_seen": 459136, |
| "step": 3555 |
| }, |
| { |
| "epoch": 3.323996265172736, |
| "grad_norm": 4.211606979370117, |
| "learning_rate": 4.222112462729552e-05, |
| "loss": 0.3948, |
| "num_input_tokens_seen": 459824, |
| "step": 3560 |
| }, |
| { |
| "epoch": 3.3286647992530347, |
| "grad_norm": 4.4313273429870605, |
| "learning_rate": 4.2191568526160485e-05, |
| "loss": 0.3832, |
| "num_input_tokens_seen": 460544, |
| "step": 3565 |
| }, |
| { |
| "epoch": 3.3333333333333335, |
| "grad_norm": 4.849913597106934, |
| "learning_rate": 4.216196676972553e-05, |
| "loss": 0.6303, |
| "num_input_tokens_seen": 461120, |
| "step": 3570 |
| }, |
| { |
| "epoch": 3.3380018674136323, |
| "grad_norm": 6.789108753204346, |
| "learning_rate": 4.213231943660344e-05, |
| "loss": 0.5102, |
| "num_input_tokens_seen": 461712, |
| "step": 3575 |
| }, |
| { |
| "epoch": 3.342670401493931, |
| "grad_norm": 5.179975986480713, |
| "learning_rate": 4.210262660552804e-05, |
| "loss": 0.5964, |
| "num_input_tokens_seen": 462368, |
| "step": 3580 |
| }, |
| { |
| "epoch": 3.34733893557423, |
| "grad_norm": 2.2866899967193604, |
| "learning_rate": 4.2072888355353966e-05, |
| "loss": 0.3182, |
| "num_input_tokens_seen": 463088, |
| "step": 3585 |
| }, |
| { |
| "epoch": 3.3520074696545286, |
| "grad_norm": 14.321250915527344, |
| "learning_rate": 4.2043104765056504e-05, |
| "loss": 0.3753, |
| "num_input_tokens_seen": 463648, |
| "step": 3590 |
| }, |
| { |
| "epoch": 3.3566760037348273, |
| "grad_norm": 2.1171340942382812, |
| "learning_rate": 4.2013275913731315e-05, |
| "loss": 0.3841, |
| "num_input_tokens_seen": 464288, |
| "step": 3595 |
| }, |
| { |
| "epoch": 3.361344537815126, |
| "grad_norm": 4.470531940460205, |
| "learning_rate": 4.198340188059429e-05, |
| "loss": 0.3792, |
| "num_input_tokens_seen": 464992, |
| "step": 3600 |
| }, |
| { |
| "epoch": 3.366013071895425, |
| "grad_norm": 2.543765068054199, |
| "learning_rate": 4.1953482744981274e-05, |
| "loss": 0.2807, |
| "num_input_tokens_seen": 465712, |
| "step": 3605 |
| }, |
| { |
| "epoch": 3.3706816059757236, |
| "grad_norm": 3.711956262588501, |
| "learning_rate": 4.192351858634792e-05, |
| "loss": 0.3673, |
| "num_input_tokens_seen": 466320, |
| "step": 3610 |
| }, |
| { |
| "epoch": 3.3753501400560224, |
| "grad_norm": 6.865347385406494, |
| "learning_rate": 4.1893509484269443e-05, |
| "loss": 0.505, |
| "num_input_tokens_seen": 466944, |
| "step": 3615 |
| }, |
| { |
| "epoch": 3.380018674136321, |
| "grad_norm": 4.681259632110596, |
| "learning_rate": 4.186345551844039e-05, |
| "loss": 0.4728, |
| "num_input_tokens_seen": 467616, |
| "step": 3620 |
| }, |
| { |
| "epoch": 3.38468720821662, |
| "grad_norm": 4.106618881225586, |
| "learning_rate": 4.183335676867448e-05, |
| "loss": 0.3185, |
| "num_input_tokens_seen": 468208, |
| "step": 3625 |
| }, |
| { |
| "epoch": 3.3893557422969187, |
| "grad_norm": 1.4591264724731445, |
| "learning_rate": 4.180321331490436e-05, |
| "loss": 0.1859, |
| "num_input_tokens_seen": 468912, |
| "step": 3630 |
| }, |
| { |
| "epoch": 3.3940242763772175, |
| "grad_norm": 4.069035053253174, |
| "learning_rate": 4.1773025237181365e-05, |
| "loss": 0.4485, |
| "num_input_tokens_seen": 469536, |
| "step": 3635 |
| }, |
| { |
| "epoch": 3.3986928104575163, |
| "grad_norm": 3.73283314704895, |
| "learning_rate": 4.1742792615675385e-05, |
| "loss": 0.4666, |
| "num_input_tokens_seen": 470112, |
| "step": 3640 |
| }, |
| { |
| "epoch": 3.403361344537815, |
| "grad_norm": 12.443367004394531, |
| "learning_rate": 4.171251553067457e-05, |
| "loss": 0.4469, |
| "num_input_tokens_seen": 470784, |
| "step": 3645 |
| }, |
| { |
| "epoch": 3.408029878618114, |
| "grad_norm": 3.0841293334960938, |
| "learning_rate": 4.168219406258515e-05, |
| "loss": 0.493, |
| "num_input_tokens_seen": 471456, |
| "step": 3650 |
| }, |
| { |
| "epoch": 3.4126984126984126, |
| "grad_norm": 4.980912685394287, |
| "learning_rate": 4.1651828291931264e-05, |
| "loss": 0.3285, |
| "num_input_tokens_seen": 472144, |
| "step": 3655 |
| }, |
| { |
| "epoch": 3.4173669467787113, |
| "grad_norm": 3.9507060050964355, |
| "learning_rate": 4.1621418299354634e-05, |
| "loss": 0.5102, |
| "num_input_tokens_seen": 472848, |
| "step": 3660 |
| }, |
| { |
| "epoch": 3.42203548085901, |
| "grad_norm": 3.0940191745758057, |
| "learning_rate": 4.159096416561449e-05, |
| "loss": 0.5427, |
| "num_input_tokens_seen": 473392, |
| "step": 3665 |
| }, |
| { |
| "epoch": 3.426704014939309, |
| "grad_norm": 4.08584451675415, |
| "learning_rate": 4.156046597158724e-05, |
| "loss": 0.2202, |
| "num_input_tokens_seen": 474032, |
| "step": 3670 |
| }, |
| { |
| "epoch": 3.431372549019608, |
| "grad_norm": 9.69025707244873, |
| "learning_rate": 4.1529923798266326e-05, |
| "loss": 0.4965, |
| "num_input_tokens_seen": 474640, |
| "step": 3675 |
| }, |
| { |
| "epoch": 3.436041083099907, |
| "grad_norm": 3.4433095455169678, |
| "learning_rate": 4.149933772676198e-05, |
| "loss": 0.5427, |
| "num_input_tokens_seen": 475232, |
| "step": 3680 |
| }, |
| { |
| "epoch": 3.4407096171802056, |
| "grad_norm": 3.5992636680603027, |
| "learning_rate": 4.146870783830101e-05, |
| "loss": 0.4117, |
| "num_input_tokens_seen": 475824, |
| "step": 3685 |
| }, |
| { |
| "epoch": 3.4453781512605044, |
| "grad_norm": 2.926135778427124, |
| "learning_rate": 4.14380342142266e-05, |
| "loss": 0.4184, |
| "num_input_tokens_seen": 476448, |
| "step": 3690 |
| }, |
| { |
| "epoch": 3.450046685340803, |
| "grad_norm": 5.663197040557861, |
| "learning_rate": 4.140731693599805e-05, |
| "loss": 0.3798, |
| "num_input_tokens_seen": 477024, |
| "step": 3695 |
| }, |
| { |
| "epoch": 3.454715219421102, |
| "grad_norm": 4.990402698516846, |
| "learning_rate": 4.137655608519063e-05, |
| "loss": 0.3333, |
| "num_input_tokens_seen": 477664, |
| "step": 3700 |
| }, |
| { |
| "epoch": 3.4593837535014007, |
| "grad_norm": 5.383077621459961, |
| "learning_rate": 4.13457517434953e-05, |
| "loss": 0.6738, |
| "num_input_tokens_seen": 478272, |
| "step": 3705 |
| }, |
| { |
| "epoch": 3.4640522875816995, |
| "grad_norm": 4.2853803634643555, |
| "learning_rate": 4.131490399271852e-05, |
| "loss": 0.6813, |
| "num_input_tokens_seen": 478816, |
| "step": 3710 |
| }, |
| { |
| "epoch": 3.4687208216619982, |
| "grad_norm": 9.064924240112305, |
| "learning_rate": 4.128401291478206e-05, |
| "loss": 0.5682, |
| "num_input_tokens_seen": 479440, |
| "step": 3715 |
| }, |
| { |
| "epoch": 3.473389355742297, |
| "grad_norm": 2.432324171066284, |
| "learning_rate": 4.12530785917227e-05, |
| "loss": 0.3548, |
| "num_input_tokens_seen": 480080, |
| "step": 3720 |
| }, |
| { |
| "epoch": 3.478057889822596, |
| "grad_norm": 2.6238317489624023, |
| "learning_rate": 4.1222101105692116e-05, |
| "loss": 0.7767, |
| "num_input_tokens_seen": 480768, |
| "step": 3725 |
| }, |
| { |
| "epoch": 3.4827264239028946, |
| "grad_norm": 5.692093849182129, |
| "learning_rate": 4.1191080538956586e-05, |
| "loss": 0.4093, |
| "num_input_tokens_seen": 481392, |
| "step": 3730 |
| }, |
| { |
| "epoch": 3.4873949579831933, |
| "grad_norm": 5.617989540100098, |
| "learning_rate": 4.116001697389678e-05, |
| "loss": 0.3229, |
| "num_input_tokens_seen": 482064, |
| "step": 3735 |
| }, |
| { |
| "epoch": 3.492063492063492, |
| "grad_norm": 11.547221183776855, |
| "learning_rate": 4.11289104930076e-05, |
| "loss": 0.4888, |
| "num_input_tokens_seen": 482800, |
| "step": 3740 |
| }, |
| { |
| "epoch": 3.496732026143791, |
| "grad_norm": 3.141573429107666, |
| "learning_rate": 4.109776117889789e-05, |
| "loss": 0.2159, |
| "num_input_tokens_seen": 483472, |
| "step": 3745 |
| }, |
| { |
| "epoch": 3.5014005602240896, |
| "grad_norm": 14.447588920593262, |
| "learning_rate": 4.1066569114290257e-05, |
| "loss": 0.3896, |
| "num_input_tokens_seen": 484320, |
| "step": 3750 |
| }, |
| { |
| "epoch": 3.503267973856209, |
| "eval_loss": 0.6631819009780884, |
| "eval_runtime": 3.8657, |
| "eval_samples_per_second": 61.568, |
| "eval_steps_per_second": 30.784, |
| "num_input_tokens_seen": 484576, |
| "step": 3752 |
| }, |
| { |
| "epoch": 3.5060690943043884, |
| "grad_norm": 6.598292827606201, |
| "learning_rate": 4.103533438202082e-05, |
| "loss": 0.4745, |
| "num_input_tokens_seen": 484976, |
| "step": 3755 |
| }, |
| { |
| "epoch": 3.510737628384687, |
| "grad_norm": 2.7642271518707275, |
| "learning_rate": 4.100405706503904e-05, |
| "loss": 0.317, |
| "num_input_tokens_seen": 485568, |
| "step": 3760 |
| }, |
| { |
| "epoch": 3.515406162464986, |
| "grad_norm": 4.99310302734375, |
| "learning_rate": 4.0972737246407444e-05, |
| "loss": 0.5242, |
| "num_input_tokens_seen": 486256, |
| "step": 3765 |
| }, |
| { |
| "epoch": 3.5200746965452847, |
| "grad_norm": 12.541024208068848, |
| "learning_rate": 4.0941375009301444e-05, |
| "loss": 0.7703, |
| "num_input_tokens_seen": 486864, |
| "step": 3770 |
| }, |
| { |
| "epoch": 3.5247432306255835, |
| "grad_norm": 10.067193031311035, |
| "learning_rate": 4.0909970437009096e-05, |
| "loss": 0.635, |
| "num_input_tokens_seen": 487568, |
| "step": 3775 |
| }, |
| { |
| "epoch": 3.5294117647058822, |
| "grad_norm": 2.522975444793701, |
| "learning_rate": 4.087852361293088e-05, |
| "loss": 0.3526, |
| "num_input_tokens_seen": 488256, |
| "step": 3780 |
| }, |
| { |
| "epoch": 3.534080298786181, |
| "grad_norm": 4.470706939697266, |
| "learning_rate": 4.084703462057949e-05, |
| "loss": 0.3543, |
| "num_input_tokens_seen": 488880, |
| "step": 3785 |
| }, |
| { |
| "epoch": 3.53874883286648, |
| "grad_norm": 1.5186313390731812, |
| "learning_rate": 4.081550354357962e-05, |
| "loss": 0.2626, |
| "num_input_tokens_seen": 489712, |
| "step": 3790 |
| }, |
| { |
| "epoch": 3.5434173669467786, |
| "grad_norm": 8.709242820739746, |
| "learning_rate": 4.078393046566769e-05, |
| "loss": 0.5864, |
| "num_input_tokens_seen": 490352, |
| "step": 3795 |
| }, |
| { |
| "epoch": 3.5480859010270773, |
| "grad_norm": 8.166577339172363, |
| "learning_rate": 4.0752315470691696e-05, |
| "loss": 0.6285, |
| "num_input_tokens_seen": 490992, |
| "step": 3800 |
| }, |
| { |
| "epoch": 3.552754435107376, |
| "grad_norm": 1.631000280380249, |
| "learning_rate": 4.0720658642610934e-05, |
| "loss": 0.2448, |
| "num_input_tokens_seen": 491712, |
| "step": 3805 |
| }, |
| { |
| "epoch": 3.557422969187675, |
| "grad_norm": 7.412605285644531, |
| "learning_rate": 4.068896006549579e-05, |
| "loss": 0.5105, |
| "num_input_tokens_seen": 492352, |
| "step": 3810 |
| }, |
| { |
| "epoch": 3.5620915032679736, |
| "grad_norm": 2.645429849624634, |
| "learning_rate": 4.0657219823527566e-05, |
| "loss": 0.6244, |
| "num_input_tokens_seen": 492976, |
| "step": 3815 |
| }, |
| { |
| "epoch": 3.5667600373482724, |
| "grad_norm": 5.971198558807373, |
| "learning_rate": 4.0625438000998153e-05, |
| "loss": 0.2768, |
| "num_input_tokens_seen": 493712, |
| "step": 3820 |
| }, |
| { |
| "epoch": 3.571428571428571, |
| "grad_norm": 3.65240216255188, |
| "learning_rate": 4.059361468230989e-05, |
| "loss": 0.5011, |
| "num_input_tokens_seen": 494320, |
| "step": 3825 |
| }, |
| { |
| "epoch": 3.57609710550887, |
| "grad_norm": 2.6212239265441895, |
| "learning_rate": 4.0561749951975324e-05, |
| "loss": 0.5193, |
| "num_input_tokens_seen": 494960, |
| "step": 3830 |
| }, |
| { |
| "epoch": 3.580765639589169, |
| "grad_norm": 6.561978340148926, |
| "learning_rate": 4.052984389461698e-05, |
| "loss": 0.2862, |
| "num_input_tokens_seen": 495664, |
| "step": 3835 |
| }, |
| { |
| "epoch": 3.585434173669468, |
| "grad_norm": 3.486088275909424, |
| "learning_rate": 4.049789659496712e-05, |
| "loss": 0.3949, |
| "num_input_tokens_seen": 496320, |
| "step": 3840 |
| }, |
| { |
| "epoch": 3.5901027077497667, |
| "grad_norm": 8.017462730407715, |
| "learning_rate": 4.0465908137867545e-05, |
| "loss": 0.5328, |
| "num_input_tokens_seen": 496992, |
| "step": 3845 |
| }, |
| { |
| "epoch": 3.5947712418300655, |
| "grad_norm": 5.656188488006592, |
| "learning_rate": 4.043387860826936e-05, |
| "loss": 0.5002, |
| "num_input_tokens_seen": 497600, |
| "step": 3850 |
| }, |
| { |
| "epoch": 3.5994397759103642, |
| "grad_norm": 0.9717182517051697, |
| "learning_rate": 4.040180809123272e-05, |
| "loss": 0.564, |
| "num_input_tokens_seen": 498192, |
| "step": 3855 |
| }, |
| { |
| "epoch": 3.604108309990663, |
| "grad_norm": 8.40697956085205, |
| "learning_rate": 4.036969667192665e-05, |
| "loss": 0.2135, |
| "num_input_tokens_seen": 498928, |
| "step": 3860 |
| }, |
| { |
| "epoch": 3.6087768440709618, |
| "grad_norm": 1.4905232191085815, |
| "learning_rate": 4.03375444356288e-05, |
| "loss": 0.1885, |
| "num_input_tokens_seen": 499648, |
| "step": 3865 |
| }, |
| { |
| "epoch": 3.6134453781512605, |
| "grad_norm": 5.896583557128906, |
| "learning_rate": 4.030535146772521e-05, |
| "loss": 1.0008, |
| "num_input_tokens_seen": 500224, |
| "step": 3870 |
| }, |
| { |
| "epoch": 3.6181139122315593, |
| "grad_norm": 4.838075637817383, |
| "learning_rate": 4.027311785371009e-05, |
| "loss": 0.2981, |
| "num_input_tokens_seen": 500832, |
| "step": 3875 |
| }, |
| { |
| "epoch": 3.622782446311858, |
| "grad_norm": 4.020202159881592, |
| "learning_rate": 4.0240843679185603e-05, |
| "loss": 0.6731, |
| "num_input_tokens_seen": 501440, |
| "step": 3880 |
| }, |
| { |
| "epoch": 3.627450980392157, |
| "grad_norm": 4.53800630569458, |
| "learning_rate": 4.020852902986162e-05, |
| "loss": 0.3302, |
| "num_input_tokens_seen": 501968, |
| "step": 3885 |
| }, |
| { |
| "epoch": 3.6321195144724556, |
| "grad_norm": 4.003100872039795, |
| "learning_rate": 4.017617399155548e-05, |
| "loss": 0.5148, |
| "num_input_tokens_seen": 502576, |
| "step": 3890 |
| }, |
| { |
| "epoch": 3.6367880485527544, |
| "grad_norm": 5.846912860870361, |
| "learning_rate": 4.0143778650191835e-05, |
| "loss": 0.3139, |
| "num_input_tokens_seen": 503280, |
| "step": 3895 |
| }, |
| { |
| "epoch": 3.641456582633053, |
| "grad_norm": 5.185009956359863, |
| "learning_rate": 4.01113430918023e-05, |
| "loss": 0.4181, |
| "num_input_tokens_seen": 503920, |
| "step": 3900 |
| }, |
| { |
| "epoch": 3.646125116713352, |
| "grad_norm": 3.249129295349121, |
| "learning_rate": 4.0078867402525354e-05, |
| "loss": 0.3392, |
| "num_input_tokens_seen": 504608, |
| "step": 3905 |
| }, |
| { |
| "epoch": 3.6507936507936507, |
| "grad_norm": 3.677781343460083, |
| "learning_rate": 4.004635166860602e-05, |
| "loss": 0.3489, |
| "num_input_tokens_seen": 505152, |
| "step": 3910 |
| }, |
| { |
| "epoch": 3.6554621848739495, |
| "grad_norm": 1.4997936487197876, |
| "learning_rate": 4.0013795976395674e-05, |
| "loss": 0.2741, |
| "num_input_tokens_seen": 505856, |
| "step": 3915 |
| }, |
| { |
| "epoch": 3.6601307189542482, |
| "grad_norm": 2.245586633682251, |
| "learning_rate": 3.9981200412351816e-05, |
| "loss": 0.332, |
| "num_input_tokens_seen": 506480, |
| "step": 3920 |
| }, |
| { |
| "epoch": 3.664799253034547, |
| "grad_norm": 7.378620147705078, |
| "learning_rate": 3.99485650630378e-05, |
| "loss": 0.5586, |
| "num_input_tokens_seen": 507040, |
| "step": 3925 |
| }, |
| { |
| "epoch": 3.669467787114846, |
| "grad_norm": 43.491329193115234, |
| "learning_rate": 3.9915890015122683e-05, |
| "loss": 0.3228, |
| "num_input_tokens_seen": 507632, |
| "step": 3930 |
| }, |
| { |
| "epoch": 3.674136321195145, |
| "grad_norm": 4.064418315887451, |
| "learning_rate": 3.988317535538092e-05, |
| "loss": 0.3529, |
| "num_input_tokens_seen": 508208, |
| "step": 3935 |
| }, |
| { |
| "epoch": 3.6788048552754438, |
| "grad_norm": 7.0785627365112305, |
| "learning_rate": 3.985042117069217e-05, |
| "loss": 0.85, |
| "num_input_tokens_seen": 508832, |
| "step": 3940 |
| }, |
| { |
| "epoch": 3.6834733893557425, |
| "grad_norm": 1.5454710721969604, |
| "learning_rate": 3.981762754804107e-05, |
| "loss": 0.4017, |
| "num_input_tokens_seen": 509488, |
| "step": 3945 |
| }, |
| { |
| "epoch": 3.6881419234360413, |
| "grad_norm": 6.187710762023926, |
| "learning_rate": 3.9784794574516945e-05, |
| "loss": 0.4936, |
| "num_input_tokens_seen": 510112, |
| "step": 3950 |
| }, |
| { |
| "epoch": 3.69281045751634, |
| "grad_norm": 4.999000549316406, |
| "learning_rate": 3.975192233731369e-05, |
| "loss": 0.4316, |
| "num_input_tokens_seen": 510720, |
| "step": 3955 |
| }, |
| { |
| "epoch": 3.697478991596639, |
| "grad_norm": 2.906360149383545, |
| "learning_rate": 3.971901092372942e-05, |
| "loss": 0.3729, |
| "num_input_tokens_seen": 511344, |
| "step": 3960 |
| }, |
| { |
| "epoch": 3.7021475256769376, |
| "grad_norm": 2.675576686859131, |
| "learning_rate": 3.968606042116632e-05, |
| "loss": 0.483, |
| "num_input_tokens_seen": 512016, |
| "step": 3965 |
| }, |
| { |
| "epoch": 3.7068160597572364, |
| "grad_norm": 6.858824729919434, |
| "learning_rate": 3.965307091713037e-05, |
| "loss": 0.5243, |
| "num_input_tokens_seen": 512592, |
| "step": 3970 |
| }, |
| { |
| "epoch": 3.711484593837535, |
| "grad_norm": 14.2874174118042, |
| "learning_rate": 3.962004249923112e-05, |
| "loss": 0.5098, |
| "num_input_tokens_seen": 513248, |
| "step": 3975 |
| }, |
| { |
| "epoch": 3.716153127917834, |
| "grad_norm": 8.637978553771973, |
| "learning_rate": 3.958697525518148e-05, |
| "loss": 0.3194, |
| "num_input_tokens_seen": 513888, |
| "step": 3980 |
| }, |
| { |
| "epoch": 3.7208216619981327, |
| "grad_norm": 4.814327716827393, |
| "learning_rate": 3.955386927279744e-05, |
| "loss": 0.4633, |
| "num_input_tokens_seen": 514480, |
| "step": 3985 |
| }, |
| { |
| "epoch": 3.7254901960784315, |
| "grad_norm": 5.299570560455322, |
| "learning_rate": 3.952072463999791e-05, |
| "loss": 0.3609, |
| "num_input_tokens_seen": 515072, |
| "step": 3990 |
| }, |
| { |
| "epoch": 3.7301587301587302, |
| "grad_norm": 4.821151256561279, |
| "learning_rate": 3.94875414448044e-05, |
| "loss": 0.6235, |
| "num_input_tokens_seen": 515712, |
| "step": 3995 |
| }, |
| { |
| "epoch": 3.734827264239029, |
| "grad_norm": 4.8716254234313965, |
| "learning_rate": 3.945431977534086e-05, |
| "loss": 0.574, |
| "num_input_tokens_seen": 516464, |
| "step": 4000 |
| }, |
| { |
| "epoch": 3.7394957983193278, |
| "grad_norm": 13.349778175354004, |
| "learning_rate": 3.942105971983341e-05, |
| "loss": 0.4969, |
| "num_input_tokens_seen": 517168, |
| "step": 4005 |
| }, |
| { |
| "epoch": 3.7441643323996265, |
| "grad_norm": 4.25555419921875, |
| "learning_rate": 3.938776136661008e-05, |
| "loss": 0.6308, |
| "num_input_tokens_seen": 517792, |
| "step": 4010 |
| }, |
| { |
| "epoch": 3.7488328664799253, |
| "grad_norm": 2.265429735183716, |
| "learning_rate": 3.935442480410065e-05, |
| "loss": 0.5301, |
| "num_input_tokens_seen": 518480, |
| "step": 4015 |
| }, |
| { |
| "epoch": 3.753501400560224, |
| "grad_norm": 4.872378826141357, |
| "learning_rate": 3.932105012083637e-05, |
| "loss": 0.3686, |
| "num_input_tokens_seen": 519136, |
| "step": 4020 |
| }, |
| { |
| "epoch": 3.758169934640523, |
| "grad_norm": 2.909376382827759, |
| "learning_rate": 3.928763740544967e-05, |
| "loss": 0.3435, |
| "num_input_tokens_seen": 519696, |
| "step": 4025 |
| }, |
| { |
| "epoch": 3.7628384687208216, |
| "grad_norm": 2.4545042514801025, |
| "learning_rate": 3.925418674667405e-05, |
| "loss": 0.1534, |
| "num_input_tokens_seen": 520304, |
| "step": 4030 |
| }, |
| { |
| "epoch": 3.7675070028011204, |
| "grad_norm": 5.603614330291748, |
| "learning_rate": 3.922069823334373e-05, |
| "loss": 0.3479, |
| "num_input_tokens_seen": 520944, |
| "step": 4035 |
| }, |
| { |
| "epoch": 3.772175536881419, |
| "grad_norm": 5.503204822540283, |
| "learning_rate": 3.918717195439349e-05, |
| "loss": 0.6142, |
| "num_input_tokens_seen": 521584, |
| "step": 4040 |
| }, |
| { |
| "epoch": 3.776844070961718, |
| "grad_norm": 9.066848754882812, |
| "learning_rate": 3.915360799885837e-05, |
| "loss": 0.4284, |
| "num_input_tokens_seen": 522208, |
| "step": 4045 |
| }, |
| { |
| "epoch": 3.7815126050420167, |
| "grad_norm": 6.612861156463623, |
| "learning_rate": 3.9120006455873506e-05, |
| "loss": 0.417, |
| "num_input_tokens_seen": 522880, |
| "step": 4050 |
| }, |
| { |
| "epoch": 3.7861811391223155, |
| "grad_norm": 3.602097511291504, |
| "learning_rate": 3.908636741467382e-05, |
| "loss": 0.3534, |
| "num_input_tokens_seen": 523520, |
| "step": 4055 |
| }, |
| { |
| "epoch": 3.7908496732026142, |
| "grad_norm": 1.3662033081054688, |
| "learning_rate": 3.905269096459384e-05, |
| "loss": 0.2182, |
| "num_input_tokens_seen": 524208, |
| "step": 4060 |
| }, |
| { |
| "epoch": 3.795518207282913, |
| "grad_norm": 9.043498992919922, |
| "learning_rate": 3.901897719506743e-05, |
| "loss": 0.2156, |
| "num_input_tokens_seen": 524912, |
| "step": 4065 |
| }, |
| { |
| "epoch": 3.8001867413632118, |
| "grad_norm": 6.337942600250244, |
| "learning_rate": 3.8985226195627563e-05, |
| "loss": 0.3852, |
| "num_input_tokens_seen": 525488, |
| "step": 4070 |
| }, |
| { |
| "epoch": 3.8048552754435105, |
| "grad_norm": 0.5356036424636841, |
| "learning_rate": 3.8951438055906084e-05, |
| "loss": 0.3144, |
| "num_input_tokens_seen": 526096, |
| "step": 4075 |
| }, |
| { |
| "epoch": 3.8095238095238093, |
| "grad_norm": 7.135247707366943, |
| "learning_rate": 3.891761286563347e-05, |
| "loss": 0.8808, |
| "num_input_tokens_seen": 526752, |
| "step": 4080 |
| }, |
| { |
| "epoch": 3.814192343604108, |
| "grad_norm": 1.464228630065918, |
| "learning_rate": 3.88837507146386e-05, |
| "loss": 0.2911, |
| "num_input_tokens_seen": 527440, |
| "step": 4085 |
| }, |
| { |
| "epoch": 3.818860877684407, |
| "grad_norm": 3.206880569458008, |
| "learning_rate": 3.88498516928485e-05, |
| "loss": 0.6713, |
| "num_input_tokens_seen": 528032, |
| "step": 4090 |
| }, |
| { |
| "epoch": 3.8235294117647056, |
| "grad_norm": 5.08414363861084, |
| "learning_rate": 3.881591589028809e-05, |
| "loss": 0.3554, |
| "num_input_tokens_seen": 528688, |
| "step": 4095 |
| }, |
| { |
| "epoch": 3.828197945845005, |
| "grad_norm": 2.6961653232574463, |
| "learning_rate": 3.878194339708002e-05, |
| "loss": 0.2922, |
| "num_input_tokens_seen": 529344, |
| "step": 4100 |
| }, |
| { |
| "epoch": 3.8328664799253036, |
| "grad_norm": 4.350805759429932, |
| "learning_rate": 3.8747934303444344e-05, |
| "loss": 0.4911, |
| "num_input_tokens_seen": 529920, |
| "step": 4105 |
| }, |
| { |
| "epoch": 3.8375350140056024, |
| "grad_norm": 12.087987899780273, |
| "learning_rate": 3.871388869969833e-05, |
| "loss": 1.0404, |
| "num_input_tokens_seen": 530544, |
| "step": 4110 |
| }, |
| { |
| "epoch": 3.842203548085901, |
| "grad_norm": 6.6325883865356445, |
| "learning_rate": 3.867980667625618e-05, |
| "loss": 0.6221, |
| "num_input_tokens_seen": 531232, |
| "step": 4115 |
| }, |
| { |
| "epoch": 3.8468720821662, |
| "grad_norm": 4.015625953674316, |
| "learning_rate": 3.864568832362885e-05, |
| "loss": 0.4101, |
| "num_input_tokens_seen": 531872, |
| "step": 4120 |
| }, |
| { |
| "epoch": 3.8515406162464987, |
| "grad_norm": 4.436892986297607, |
| "learning_rate": 3.861153373242374e-05, |
| "loss": 0.2394, |
| "num_input_tokens_seen": 532576, |
| "step": 4125 |
| }, |
| { |
| "epoch": 3.8562091503267975, |
| "grad_norm": 2.150585651397705, |
| "learning_rate": 3.857734299334452e-05, |
| "loss": 0.3944, |
| "num_input_tokens_seen": 533216, |
| "step": 4130 |
| }, |
| { |
| "epoch": 3.860877684407096, |
| "grad_norm": 2.3524234294891357, |
| "learning_rate": 3.854311619719084e-05, |
| "loss": 0.2307, |
| "num_input_tokens_seen": 533856, |
| "step": 4135 |
| }, |
| { |
| "epoch": 3.865546218487395, |
| "grad_norm": 9.083573341369629, |
| "learning_rate": 3.850885343485811e-05, |
| "loss": 0.5951, |
| "num_input_tokens_seen": 534448, |
| "step": 4140 |
| }, |
| { |
| "epoch": 3.8702147525676938, |
| "grad_norm": 4.580225467681885, |
| "learning_rate": 3.847455479733724e-05, |
| "loss": 0.3928, |
| "num_input_tokens_seen": 535072, |
| "step": 4145 |
| }, |
| { |
| "epoch": 3.8748832866479925, |
| "grad_norm": 7.7241106033325195, |
| "learning_rate": 3.844022037571443e-05, |
| "loss": 0.2974, |
| "num_input_tokens_seen": 535760, |
| "step": 4150 |
| }, |
| { |
| "epoch": 3.8795518207282913, |
| "grad_norm": 6.325448989868164, |
| "learning_rate": 3.840585026117093e-05, |
| "loss": 0.3938, |
| "num_input_tokens_seen": 536368, |
| "step": 4155 |
| }, |
| { |
| "epoch": 3.88422035480859, |
| "grad_norm": 1.6497693061828613, |
| "learning_rate": 3.837144454498272e-05, |
| "loss": 0.2775, |
| "num_input_tokens_seen": 537104, |
| "step": 4160 |
| }, |
| { |
| "epoch": 3.888888888888889, |
| "grad_norm": 4.049694538116455, |
| "learning_rate": 3.8337003318520394e-05, |
| "loss": 0.3838, |
| "num_input_tokens_seen": 537776, |
| "step": 4165 |
| }, |
| { |
| "epoch": 3.8935574229691876, |
| "grad_norm": 9.887384414672852, |
| "learning_rate": 3.8302526673248796e-05, |
| "loss": 0.3456, |
| "num_input_tokens_seen": 538384, |
| "step": 4170 |
| }, |
| { |
| "epoch": 3.8982259570494864, |
| "grad_norm": 5.9591779708862305, |
| "learning_rate": 3.8268014700726876e-05, |
| "loss": 0.3213, |
| "num_input_tokens_seen": 539120, |
| "step": 4175 |
| }, |
| { |
| "epoch": 3.902894491129785, |
| "grad_norm": 5.987460613250732, |
| "learning_rate": 3.8233467492607354e-05, |
| "loss": 0.5887, |
| "num_input_tokens_seen": 539792, |
| "step": 4180 |
| }, |
| { |
| "epoch": 3.907563025210084, |
| "grad_norm": 5.307256698608398, |
| "learning_rate": 3.819888514063658e-05, |
| "loss": 0.4946, |
| "num_input_tokens_seen": 540496, |
| "step": 4185 |
| }, |
| { |
| "epoch": 3.9122315592903827, |
| "grad_norm": 5.681676387786865, |
| "learning_rate": 3.8164267736654166e-05, |
| "loss": 0.3623, |
| "num_input_tokens_seen": 541088, |
| "step": 4190 |
| }, |
| { |
| "epoch": 3.9169000933706815, |
| "grad_norm": 6.065107822418213, |
| "learning_rate": 3.812961537259289e-05, |
| "loss": 0.5706, |
| "num_input_tokens_seen": 541856, |
| "step": 4195 |
| }, |
| { |
| "epoch": 3.9215686274509802, |
| "grad_norm": 4.150299549102783, |
| "learning_rate": 3.809492814047831e-05, |
| "loss": 0.3745, |
| "num_input_tokens_seen": 542544, |
| "step": 4200 |
| }, |
| { |
| "epoch": 3.9262371615312794, |
| "grad_norm": 3.3918418884277344, |
| "learning_rate": 3.80602061324286e-05, |
| "loss": 0.6235, |
| "num_input_tokens_seen": 543152, |
| "step": 4205 |
| }, |
| { |
| "epoch": 3.930905695611578, |
| "grad_norm": 3.0744729042053223, |
| "learning_rate": 3.802544944065431e-05, |
| "loss": 0.4412, |
| "num_input_tokens_seen": 543760, |
| "step": 4210 |
| }, |
| { |
| "epoch": 3.935574229691877, |
| "grad_norm": 5.920441627502441, |
| "learning_rate": 3.799065815745808e-05, |
| "loss": 0.3594, |
| "num_input_tokens_seen": 544368, |
| "step": 4215 |
| }, |
| { |
| "epoch": 3.9402427637721757, |
| "grad_norm": 0.5871037244796753, |
| "learning_rate": 3.7955832375234404e-05, |
| "loss": 0.2189, |
| "num_input_tokens_seen": 544928, |
| "step": 4220 |
| }, |
| { |
| "epoch": 3.9449112978524745, |
| "grad_norm": 1.7833948135375977, |
| "learning_rate": 3.7920972186469406e-05, |
| "loss": 0.5074, |
| "num_input_tokens_seen": 545648, |
| "step": 4225 |
| }, |
| { |
| "epoch": 3.9495798319327733, |
| "grad_norm": 2.337198257446289, |
| "learning_rate": 3.788607768374059e-05, |
| "loss": 0.2963, |
| "num_input_tokens_seen": 546288, |
| "step": 4230 |
| }, |
| { |
| "epoch": 3.954248366013072, |
| "grad_norm": 2.6441376209259033, |
| "learning_rate": 3.785114895971658e-05, |
| "loss": 0.6003, |
| "num_input_tokens_seen": 546976, |
| "step": 4235 |
| }, |
| { |
| "epoch": 3.958916900093371, |
| "grad_norm": 10.86312198638916, |
| "learning_rate": 3.781618610715687e-05, |
| "loss": 0.725, |
| "num_input_tokens_seen": 547600, |
| "step": 4240 |
| }, |
| { |
| "epoch": 3.9635854341736696, |
| "grad_norm": 7.368520259857178, |
| "learning_rate": 3.77811892189116e-05, |
| "loss": 0.5826, |
| "num_input_tokens_seen": 548240, |
| "step": 4245 |
| }, |
| { |
| "epoch": 3.9682539682539684, |
| "grad_norm": 7.850865840911865, |
| "learning_rate": 3.774615838792131e-05, |
| "loss": 0.3734, |
| "num_input_tokens_seen": 548880, |
| "step": 4250 |
| }, |
| { |
| "epoch": 3.972922502334267, |
| "grad_norm": 6.826239585876465, |
| "learning_rate": 3.771109370721666e-05, |
| "loss": 0.5284, |
| "num_input_tokens_seen": 549456, |
| "step": 4255 |
| }, |
| { |
| "epoch": 3.977591036414566, |
| "grad_norm": 8.290151596069336, |
| "learning_rate": 3.7675995269918205e-05, |
| "loss": 0.4612, |
| "num_input_tokens_seen": 550064, |
| "step": 4260 |
| }, |
| { |
| "epoch": 3.9822595704948647, |
| "grad_norm": 7.847808361053467, |
| "learning_rate": 3.764086316923616e-05, |
| "loss": 0.3022, |
| "num_input_tokens_seen": 550784, |
| "step": 4265 |
| }, |
| { |
| "epoch": 3.9869281045751634, |
| "grad_norm": 4.398458957672119, |
| "learning_rate": 3.760569749847013e-05, |
| "loss": 0.1849, |
| "num_input_tokens_seen": 551472, |
| "step": 4270 |
| }, |
| { |
| "epoch": 3.991596638655462, |
| "grad_norm": 3.6097769737243652, |
| "learning_rate": 3.757049835100888e-05, |
| "loss": 0.2915, |
| "num_input_tokens_seen": 552080, |
| "step": 4275 |
| }, |
| { |
| "epoch": 3.996265172735761, |
| "grad_norm": 5.2276930809021, |
| "learning_rate": 3.753526582033007e-05, |
| "loss": 0.295, |
| "num_input_tokens_seen": 552688, |
| "step": 4280 |
| }, |
| { |
| "epoch": 4.00093370681606, |
| "grad_norm": 3.192323923110962, |
| "learning_rate": 3.7500000000000003e-05, |
| "loss": 0.1977, |
| "num_input_tokens_seen": 553248, |
| "step": 4285 |
| }, |
| { |
| "epoch": 4.003734827264239, |
| "eval_loss": 0.6991814374923706, |
| "eval_runtime": 3.8602, |
| "eval_samples_per_second": 61.654, |
| "eval_steps_per_second": 30.827, |
| "num_input_tokens_seen": 553632, |
| "step": 4288 |
| }, |
| { |
| "epoch": 4.0056022408963585, |
| "grad_norm": 2.7682571411132812, |
| "learning_rate": 3.7464700983673416e-05, |
| "loss": 0.1532, |
| "num_input_tokens_seen": 553840, |
| "step": 4290 |
| }, |
| { |
| "epoch": 4.010270774976657, |
| "grad_norm": 7.069489479064941, |
| "learning_rate": 3.74293688650932e-05, |
| "loss": 0.1893, |
| "num_input_tokens_seen": 554464, |
| "step": 4295 |
| }, |
| { |
| "epoch": 4.014939309056956, |
| "grad_norm": 1.4517050981521606, |
| "learning_rate": 3.739400373809012e-05, |
| "loss": 0.3711, |
| "num_input_tokens_seen": 555072, |
| "step": 4300 |
| }, |
| { |
| "epoch": 4.019607843137255, |
| "grad_norm": 1.9523597955703735, |
| "learning_rate": 3.735860569658265e-05, |
| "loss": 0.1403, |
| "num_input_tokens_seen": 555696, |
| "step": 4305 |
| }, |
| { |
| "epoch": 4.024276377217554, |
| "grad_norm": 2.3467447757720947, |
| "learning_rate": 3.7323174834576634e-05, |
| "loss": 0.2017, |
| "num_input_tokens_seen": 556352, |
| "step": 4310 |
| }, |
| { |
| "epoch": 4.028944911297852, |
| "grad_norm": 4.454637050628662, |
| "learning_rate": 3.728771124616511e-05, |
| "loss": 0.3451, |
| "num_input_tokens_seen": 556912, |
| "step": 4315 |
| }, |
| { |
| "epoch": 4.033613445378151, |
| "grad_norm": 12.304389953613281, |
| "learning_rate": 3.7252215025528004e-05, |
| "loss": 0.507, |
| "num_input_tokens_seen": 557488, |
| "step": 4320 |
| }, |
| { |
| "epoch": 4.03828197945845, |
| "grad_norm": 6.556013584136963, |
| "learning_rate": 3.72166862669319e-05, |
| "loss": 0.3884, |
| "num_input_tokens_seen": 558128, |
| "step": 4325 |
| }, |
| { |
| "epoch": 4.042950513538749, |
| "grad_norm": 5.730507850646973, |
| "learning_rate": 3.7181125064729815e-05, |
| "loss": 0.1663, |
| "num_input_tokens_seen": 558800, |
| "step": 4330 |
| }, |
| { |
| "epoch": 4.0476190476190474, |
| "grad_norm": 24.123844146728516, |
| "learning_rate": 3.714553151336091e-05, |
| "loss": 0.4761, |
| "num_input_tokens_seen": 559472, |
| "step": 4335 |
| }, |
| { |
| "epoch": 4.052287581699346, |
| "grad_norm": 4.613626480102539, |
| "learning_rate": 3.710990570735025e-05, |
| "loss": 0.1796, |
| "num_input_tokens_seen": 560080, |
| "step": 4340 |
| }, |
| { |
| "epoch": 4.056956115779645, |
| "grad_norm": 48.392879486083984, |
| "learning_rate": 3.707424774130858e-05, |
| "loss": 0.1985, |
| "num_input_tokens_seen": 560752, |
| "step": 4345 |
| }, |
| { |
| "epoch": 4.061624649859944, |
| "grad_norm": 4.906088829040527, |
| "learning_rate": 3.703855770993201e-05, |
| "loss": 0.2424, |
| "num_input_tokens_seen": 561504, |
| "step": 4350 |
| }, |
| { |
| "epoch": 4.0662931839402425, |
| "grad_norm": 6.213633060455322, |
| "learning_rate": 3.700283570800187e-05, |
| "loss": 0.4731, |
| "num_input_tokens_seen": 562080, |
| "step": 4355 |
| }, |
| { |
| "epoch": 4.070961718020541, |
| "grad_norm": 2.7080910205841064, |
| "learning_rate": 3.696708183038432e-05, |
| "loss": 0.2403, |
| "num_input_tokens_seen": 562752, |
| "step": 4360 |
| }, |
| { |
| "epoch": 4.07563025210084, |
| "grad_norm": 1.7861884832382202, |
| "learning_rate": 3.6931296172030236e-05, |
| "loss": 0.3308, |
| "num_input_tokens_seen": 563392, |
| "step": 4365 |
| }, |
| { |
| "epoch": 4.080298786181139, |
| "grad_norm": 2.173994779586792, |
| "learning_rate": 3.689547882797485e-05, |
| "loss": 0.3537, |
| "num_input_tokens_seen": 564080, |
| "step": 4370 |
| }, |
| { |
| "epoch": 4.084967320261438, |
| "grad_norm": 6.196120738983154, |
| "learning_rate": 3.6859629893337556e-05, |
| "loss": 0.286, |
| "num_input_tokens_seen": 564768, |
| "step": 4375 |
| }, |
| { |
| "epoch": 4.089635854341736, |
| "grad_norm": 7.646449089050293, |
| "learning_rate": 3.682374946332165e-05, |
| "loss": 0.4532, |
| "num_input_tokens_seen": 565392, |
| "step": 4380 |
| }, |
| { |
| "epoch": 4.094304388422035, |
| "grad_norm": 6.959907531738281, |
| "learning_rate": 3.6787837633214064e-05, |
| "loss": 0.2021, |
| "num_input_tokens_seen": 565984, |
| "step": 4385 |
| }, |
| { |
| "epoch": 4.098972922502334, |
| "grad_norm": 4.212170124053955, |
| "learning_rate": 3.67518944983851e-05, |
| "loss": 0.2379, |
| "num_input_tokens_seen": 566608, |
| "step": 4390 |
| }, |
| { |
| "epoch": 4.103641456582633, |
| "grad_norm": 4.175776481628418, |
| "learning_rate": 3.671592015428823e-05, |
| "loss": 0.2317, |
| "num_input_tokens_seen": 567232, |
| "step": 4395 |
| }, |
| { |
| "epoch": 4.1083099906629315, |
| "grad_norm": 5.702651500701904, |
| "learning_rate": 3.667991469645979e-05, |
| "loss": 0.2823, |
| "num_input_tokens_seen": 567888, |
| "step": 4400 |
| }, |
| { |
| "epoch": 4.11297852474323, |
| "grad_norm": 3.9328668117523193, |
| "learning_rate": 3.6643878220518736e-05, |
| "loss": 0.3263, |
| "num_input_tokens_seen": 568480, |
| "step": 4405 |
| }, |
| { |
| "epoch": 4.117647058823529, |
| "grad_norm": 10.232110023498535, |
| "learning_rate": 3.6607810822166404e-05, |
| "loss": 0.3833, |
| "num_input_tokens_seen": 569120, |
| "step": 4410 |
| }, |
| { |
| "epoch": 4.122315592903828, |
| "grad_norm": 4.055398464202881, |
| "learning_rate": 3.657171259718626e-05, |
| "loss": 0.3163, |
| "num_input_tokens_seen": 569696, |
| "step": 4415 |
| }, |
| { |
| "epoch": 4.1269841269841265, |
| "grad_norm": 3.281766414642334, |
| "learning_rate": 3.6535583641443634e-05, |
| "loss": 0.1429, |
| "num_input_tokens_seen": 570528, |
| "step": 4420 |
| }, |
| { |
| "epoch": 4.131652661064426, |
| "grad_norm": 16.788419723510742, |
| "learning_rate": 3.649942405088544e-05, |
| "loss": 0.6168, |
| "num_input_tokens_seen": 571168, |
| "step": 4425 |
| }, |
| { |
| "epoch": 4.136321195144725, |
| "grad_norm": 3.544717311859131, |
| "learning_rate": 3.646323392153999e-05, |
| "loss": 0.2535, |
| "num_input_tokens_seen": 571776, |
| "step": 4430 |
| }, |
| { |
| "epoch": 4.140989729225024, |
| "grad_norm": 5.109630107879639, |
| "learning_rate": 3.6427013349516664e-05, |
| "loss": 0.1899, |
| "num_input_tokens_seen": 572368, |
| "step": 4435 |
| }, |
| { |
| "epoch": 4.1456582633053225, |
| "grad_norm": 4.163717269897461, |
| "learning_rate": 3.639076243100571e-05, |
| "loss": 0.4238, |
| "num_input_tokens_seen": 573072, |
| "step": 4440 |
| }, |
| { |
| "epoch": 4.150326797385621, |
| "grad_norm": 13.520872116088867, |
| "learning_rate": 3.635448126227795e-05, |
| "loss": 0.4113, |
| "num_input_tokens_seen": 573680, |
| "step": 4445 |
| }, |
| { |
| "epoch": 4.15499533146592, |
| "grad_norm": 3.852015733718872, |
| "learning_rate": 3.631816993968455e-05, |
| "loss": 0.2477, |
| "num_input_tokens_seen": 574352, |
| "step": 4450 |
| }, |
| { |
| "epoch": 4.159663865546219, |
| "grad_norm": 26.530088424682617, |
| "learning_rate": 3.628182855965676e-05, |
| "loss": 0.1845, |
| "num_input_tokens_seen": 575056, |
| "step": 4455 |
| }, |
| { |
| "epoch": 4.164332399626518, |
| "grad_norm": 1.7594077587127686, |
| "learning_rate": 3.624545721870563e-05, |
| "loss": 0.338, |
| "num_input_tokens_seen": 575792, |
| "step": 4460 |
| }, |
| { |
| "epoch": 4.169000933706816, |
| "grad_norm": 1.8824595212936401, |
| "learning_rate": 3.6209056013421805e-05, |
| "loss": 0.2513, |
| "num_input_tokens_seen": 576432, |
| "step": 4465 |
| }, |
| { |
| "epoch": 4.173669467787115, |
| "grad_norm": 7.08219575881958, |
| "learning_rate": 3.617262504047523e-05, |
| "loss": 0.3619, |
| "num_input_tokens_seen": 577024, |
| "step": 4470 |
| }, |
| { |
| "epoch": 4.178338001867414, |
| "grad_norm": 1.5463865995407104, |
| "learning_rate": 3.613616439661489e-05, |
| "loss": 0.3355, |
| "num_input_tokens_seen": 577632, |
| "step": 4475 |
| }, |
| { |
| "epoch": 4.183006535947713, |
| "grad_norm": 4.219307899475098, |
| "learning_rate": 3.6099674178668595e-05, |
| "loss": 0.1978, |
| "num_input_tokens_seen": 578224, |
| "step": 4480 |
| }, |
| { |
| "epoch": 4.187675070028011, |
| "grad_norm": 24.176820755004883, |
| "learning_rate": 3.606315448354265e-05, |
| "loss": 0.517, |
| "num_input_tokens_seen": 578816, |
| "step": 4485 |
| }, |
| { |
| "epoch": 4.19234360410831, |
| "grad_norm": 9.95002555847168, |
| "learning_rate": 3.60266054082217e-05, |
| "loss": 0.3464, |
| "num_input_tokens_seen": 579472, |
| "step": 4490 |
| }, |
| { |
| "epoch": 4.197012138188609, |
| "grad_norm": 4.038425922393799, |
| "learning_rate": 3.599002704976835e-05, |
| "loss": 0.2076, |
| "num_input_tokens_seen": 580032, |
| "step": 4495 |
| }, |
| { |
| "epoch": 4.201680672268908, |
| "grad_norm": 1.5917898416519165, |
| "learning_rate": 3.595341950532304e-05, |
| "loss": 0.3451, |
| "num_input_tokens_seen": 580624, |
| "step": 4500 |
| }, |
| { |
| "epoch": 4.2063492063492065, |
| "grad_norm": 4.347362995147705, |
| "learning_rate": 3.591678287210366e-05, |
| "loss": 0.2847, |
| "num_input_tokens_seen": 581312, |
| "step": 4505 |
| }, |
| { |
| "epoch": 4.211017740429505, |
| "grad_norm": 2.4160451889038086, |
| "learning_rate": 3.588011724740537e-05, |
| "loss": 0.2936, |
| "num_input_tokens_seen": 581984, |
| "step": 4510 |
| }, |
| { |
| "epoch": 4.215686274509804, |
| "grad_norm": 3.665546417236328, |
| "learning_rate": 3.584342272860034e-05, |
| "loss": 0.0873, |
| "num_input_tokens_seen": 582720, |
| "step": 4515 |
| }, |
| { |
| "epoch": 4.220354808590103, |
| "grad_norm": 5.41824197769165, |
| "learning_rate": 3.580669941313746e-05, |
| "loss": 0.3641, |
| "num_input_tokens_seen": 583360, |
| "step": 4520 |
| }, |
| { |
| "epoch": 4.225023342670402, |
| "grad_norm": 7.571991920471191, |
| "learning_rate": 3.5769947398542086e-05, |
| "loss": 0.4328, |
| "num_input_tokens_seen": 584032, |
| "step": 4525 |
| }, |
| { |
| "epoch": 4.2296918767507, |
| "grad_norm": 7.419412136077881, |
| "learning_rate": 3.5733166782415805e-05, |
| "loss": 0.435, |
| "num_input_tokens_seen": 584672, |
| "step": 4530 |
| }, |
| { |
| "epoch": 4.234360410830999, |
| "grad_norm": 3.60376238822937, |
| "learning_rate": 3.5696357662436145e-05, |
| "loss": 0.3029, |
| "num_input_tokens_seen": 585296, |
| "step": 4535 |
| }, |
| { |
| "epoch": 4.239028944911298, |
| "grad_norm": 0.17386208474636078, |
| "learning_rate": 3.565952013635635e-05, |
| "loss": 0.3092, |
| "num_input_tokens_seen": 585984, |
| "step": 4540 |
| }, |
| { |
| "epoch": 4.243697478991597, |
| "grad_norm": 0.5566981434822083, |
| "learning_rate": 3.562265430200508e-05, |
| "loss": 0.3977, |
| "num_input_tokens_seen": 586624, |
| "step": 4545 |
| }, |
| { |
| "epoch": 4.248366013071895, |
| "grad_norm": 7.640924453735352, |
| "learning_rate": 3.558576025728619e-05, |
| "loss": 0.2716, |
| "num_input_tokens_seen": 587216, |
| "step": 4550 |
| }, |
| { |
| "epoch": 4.253034547152194, |
| "grad_norm": 4.149720668792725, |
| "learning_rate": 3.554883810017844e-05, |
| "loss": 0.3383, |
| "num_input_tokens_seen": 587920, |
| "step": 4555 |
| }, |
| { |
| "epoch": 4.257703081232493, |
| "grad_norm": 12.189506530761719, |
| "learning_rate": 3.551188792873527e-05, |
| "loss": 0.4403, |
| "num_input_tokens_seen": 588528, |
| "step": 4560 |
| }, |
| { |
| "epoch": 4.262371615312792, |
| "grad_norm": 5.4155426025390625, |
| "learning_rate": 3.547490984108449e-05, |
| "loss": 0.5473, |
| "num_input_tokens_seen": 589136, |
| "step": 4565 |
| }, |
| { |
| "epoch": 4.2670401493930905, |
| "grad_norm": 5.073611736297607, |
| "learning_rate": 3.543790393542805e-05, |
| "loss": 0.325, |
| "num_input_tokens_seen": 589744, |
| "step": 4570 |
| }, |
| { |
| "epoch": 4.271708683473389, |
| "grad_norm": 8.7191801071167, |
| "learning_rate": 3.5400870310041794e-05, |
| "loss": 0.3047, |
| "num_input_tokens_seen": 590368, |
| "step": 4575 |
| }, |
| { |
| "epoch": 4.276377217553688, |
| "grad_norm": 7.070326328277588, |
| "learning_rate": 3.536380906327516e-05, |
| "loss": 0.2448, |
| "num_input_tokens_seen": 590992, |
| "step": 4580 |
| }, |
| { |
| "epoch": 4.281045751633987, |
| "grad_norm": 6.877079963684082, |
| "learning_rate": 3.532672029355097e-05, |
| "loss": 0.3595, |
| "num_input_tokens_seen": 591744, |
| "step": 4585 |
| }, |
| { |
| "epoch": 4.285714285714286, |
| "grad_norm": 2.179201364517212, |
| "learning_rate": 3.5289604099365096e-05, |
| "loss": 0.123, |
| "num_input_tokens_seen": 592368, |
| "step": 4590 |
| }, |
| { |
| "epoch": 4.290382819794584, |
| "grad_norm": 3.5357630252838135, |
| "learning_rate": 3.525246057928627e-05, |
| "loss": 0.1877, |
| "num_input_tokens_seen": 593056, |
| "step": 4595 |
| }, |
| { |
| "epoch": 4.295051353874883, |
| "grad_norm": 4.004199028015137, |
| "learning_rate": 3.5215289831955786e-05, |
| "loss": 0.3331, |
| "num_input_tokens_seen": 593632, |
| "step": 4600 |
| }, |
| { |
| "epoch": 4.299719887955182, |
| "grad_norm": 5.568762302398682, |
| "learning_rate": 3.517809195608725e-05, |
| "loss": 0.5129, |
| "num_input_tokens_seen": 594272, |
| "step": 4605 |
| }, |
| { |
| "epoch": 4.304388422035481, |
| "grad_norm": 17.591794967651367, |
| "learning_rate": 3.5140867050466295e-05, |
| "loss": 0.2957, |
| "num_input_tokens_seen": 594960, |
| "step": 4610 |
| }, |
| { |
| "epoch": 4.309056956115779, |
| "grad_norm": 10.526442527770996, |
| "learning_rate": 3.510361521395035e-05, |
| "loss": 0.2965, |
| "num_input_tokens_seen": 595680, |
| "step": 4615 |
| }, |
| { |
| "epoch": 4.313725490196078, |
| "grad_norm": 5.713769435882568, |
| "learning_rate": 3.506633654546837e-05, |
| "loss": 0.2579, |
| "num_input_tokens_seen": 596320, |
| "step": 4620 |
| }, |
| { |
| "epoch": 4.318394024276377, |
| "grad_norm": 3.8671867847442627, |
| "learning_rate": 3.502903114402055e-05, |
| "loss": 0.1889, |
| "num_input_tokens_seen": 596976, |
| "step": 4625 |
| }, |
| { |
| "epoch": 4.323062558356676, |
| "grad_norm": 2.351252794265747, |
| "learning_rate": 3.499169910867809e-05, |
| "loss": 0.1431, |
| "num_input_tokens_seen": 597648, |
| "step": 4630 |
| }, |
| { |
| "epoch": 4.3277310924369745, |
| "grad_norm": 8.63907527923584, |
| "learning_rate": 3.4954340538582926e-05, |
| "loss": 0.5485, |
| "num_input_tokens_seen": 598336, |
| "step": 4635 |
| }, |
| { |
| "epoch": 4.332399626517273, |
| "grad_norm": 4.3061299324035645, |
| "learning_rate": 3.491695553294745e-05, |
| "loss": 0.3966, |
| "num_input_tokens_seen": 598928, |
| "step": 4640 |
| }, |
| { |
| "epoch": 4.337068160597572, |
| "grad_norm": 5.864599227905273, |
| "learning_rate": 3.487954419105426e-05, |
| "loss": 0.4937, |
| "num_input_tokens_seen": 599600, |
| "step": 4645 |
| }, |
| { |
| "epoch": 4.341736694677871, |
| "grad_norm": 5.271458148956299, |
| "learning_rate": 3.484210661225591e-05, |
| "loss": 0.4097, |
| "num_input_tokens_seen": 600176, |
| "step": 4650 |
| }, |
| { |
| "epoch": 4.34640522875817, |
| "grad_norm": 5.326613426208496, |
| "learning_rate": 3.4804642895974596e-05, |
| "loss": 0.244, |
| "num_input_tokens_seen": 600896, |
| "step": 4655 |
| }, |
| { |
| "epoch": 4.351073762838468, |
| "grad_norm": 4.950856685638428, |
| "learning_rate": 3.476715314170198e-05, |
| "loss": 0.414, |
| "num_input_tokens_seen": 601472, |
| "step": 4660 |
| }, |
| { |
| "epoch": 4.355742296918767, |
| "grad_norm": 11.154131889343262, |
| "learning_rate": 3.472963744899882e-05, |
| "loss": 0.4395, |
| "num_input_tokens_seen": 602224, |
| "step": 4665 |
| }, |
| { |
| "epoch": 4.360410830999066, |
| "grad_norm": 10.789007186889648, |
| "learning_rate": 3.4692095917494785e-05, |
| "loss": 0.4422, |
| "num_input_tokens_seen": 602848, |
| "step": 4670 |
| }, |
| { |
| "epoch": 4.365079365079365, |
| "grad_norm": 3.9653983116149902, |
| "learning_rate": 3.465452864688816e-05, |
| "loss": 0.4116, |
| "num_input_tokens_seen": 603392, |
| "step": 4675 |
| }, |
| { |
| "epoch": 4.369747899159664, |
| "grad_norm": 3.1383543014526367, |
| "learning_rate": 3.461693573694558e-05, |
| "loss": 0.3692, |
| "num_input_tokens_seen": 604032, |
| "step": 4680 |
| }, |
| { |
| "epoch": 4.374416433239963, |
| "grad_norm": 1.6144094467163086, |
| "learning_rate": 3.457931728750179e-05, |
| "loss": 0.2329, |
| "num_input_tokens_seen": 604688, |
| "step": 4685 |
| }, |
| { |
| "epoch": 4.379084967320262, |
| "grad_norm": 7.495258331298828, |
| "learning_rate": 3.4541673398459316e-05, |
| "loss": 0.332, |
| "num_input_tokens_seen": 605296, |
| "step": 4690 |
| }, |
| { |
| "epoch": 4.383753501400561, |
| "grad_norm": 2.3249220848083496, |
| "learning_rate": 3.4504004169788294e-05, |
| "loss": 0.1417, |
| "num_input_tokens_seen": 605968, |
| "step": 4695 |
| }, |
| { |
| "epoch": 4.388422035480859, |
| "grad_norm": 4.117162227630615, |
| "learning_rate": 3.446630970152612e-05, |
| "loss": 0.2862, |
| "num_input_tokens_seen": 606704, |
| "step": 4700 |
| }, |
| { |
| "epoch": 4.393090569561158, |
| "grad_norm": 3.2297849655151367, |
| "learning_rate": 3.4428590093777244e-05, |
| "loss": 0.3799, |
| "num_input_tokens_seen": 607392, |
| "step": 4705 |
| }, |
| { |
| "epoch": 4.397759103641457, |
| "grad_norm": 2.561896800994873, |
| "learning_rate": 3.4390845446712836e-05, |
| "loss": 0.0899, |
| "num_input_tokens_seen": 608080, |
| "step": 4710 |
| }, |
| { |
| "epoch": 4.402427637721756, |
| "grad_norm": 2.327104091644287, |
| "learning_rate": 3.4353075860570614e-05, |
| "loss": 0.1184, |
| "num_input_tokens_seen": 608768, |
| "step": 4715 |
| }, |
| { |
| "epoch": 4.4070961718020545, |
| "grad_norm": 3.8246302604675293, |
| "learning_rate": 3.4315281435654484e-05, |
| "loss": 0.5739, |
| "num_input_tokens_seen": 609472, |
| "step": 4720 |
| }, |
| { |
| "epoch": 4.411764705882353, |
| "grad_norm": 8.933488845825195, |
| "learning_rate": 3.427746227233436e-05, |
| "loss": 0.7091, |
| "num_input_tokens_seen": 610144, |
| "step": 4725 |
| }, |
| { |
| "epoch": 4.416433239962652, |
| "grad_norm": 4.204476356506348, |
| "learning_rate": 3.4239618471045795e-05, |
| "loss": 0.2344, |
| "num_input_tokens_seen": 610704, |
| "step": 4730 |
| }, |
| { |
| "epoch": 4.421101774042951, |
| "grad_norm": 1.9256473779678345, |
| "learning_rate": 3.420175013228982e-05, |
| "loss": 0.2514, |
| "num_input_tokens_seen": 611408, |
| "step": 4735 |
| }, |
| { |
| "epoch": 4.42577030812325, |
| "grad_norm": 5.400211811065674, |
| "learning_rate": 3.416385735663262e-05, |
| "loss": 0.3381, |
| "num_input_tokens_seen": 612096, |
| "step": 4740 |
| }, |
| { |
| "epoch": 4.430438842203548, |
| "grad_norm": 5.82473611831665, |
| "learning_rate": 3.412594024470526e-05, |
| "loss": 0.6267, |
| "num_input_tokens_seen": 612656, |
| "step": 4745 |
| }, |
| { |
| "epoch": 4.435107376283847, |
| "grad_norm": 3.858933687210083, |
| "learning_rate": 3.408799889720345e-05, |
| "loss": 0.2105, |
| "num_input_tokens_seen": 613536, |
| "step": 4750 |
| }, |
| { |
| "epoch": 4.439775910364146, |
| "grad_norm": 2.8734130859375, |
| "learning_rate": 3.405003341488726e-05, |
| "loss": 0.3456, |
| "num_input_tokens_seen": 614112, |
| "step": 4755 |
| }, |
| { |
| "epoch": 4.444444444444445, |
| "grad_norm": 2.022538900375366, |
| "learning_rate": 3.401204389858085e-05, |
| "loss": 0.1767, |
| "num_input_tokens_seen": 614832, |
| "step": 4760 |
| }, |
| { |
| "epoch": 4.449112978524743, |
| "grad_norm": 1.868141531944275, |
| "learning_rate": 3.3974030449172206e-05, |
| "loss": 0.2623, |
| "num_input_tokens_seen": 615456, |
| "step": 4765 |
| }, |
| { |
| "epoch": 4.453781512605042, |
| "grad_norm": 9.049056053161621, |
| "learning_rate": 3.393599316761288e-05, |
| "loss": 0.5222, |
| "num_input_tokens_seen": 616048, |
| "step": 4770 |
| }, |
| { |
| "epoch": 4.458450046685341, |
| "grad_norm": 2.4421088695526123, |
| "learning_rate": 3.389793215491769e-05, |
| "loss": 0.2303, |
| "num_input_tokens_seen": 616704, |
| "step": 4775 |
| }, |
| { |
| "epoch": 4.46311858076564, |
| "grad_norm": 4.034763813018799, |
| "learning_rate": 3.385984751216452e-05, |
| "loss": 0.2149, |
| "num_input_tokens_seen": 617424, |
| "step": 4780 |
| }, |
| { |
| "epoch": 4.4677871148459385, |
| "grad_norm": 1.4706504344940186, |
| "learning_rate": 3.382173934049397e-05, |
| "loss": 0.2602, |
| "num_input_tokens_seen": 618176, |
| "step": 4785 |
| }, |
| { |
| "epoch": 4.472455648926237, |
| "grad_norm": 7.432245254516602, |
| "learning_rate": 3.378360774110916e-05, |
| "loss": 0.473, |
| "num_input_tokens_seen": 618768, |
| "step": 4790 |
| }, |
| { |
| "epoch": 4.477124183006536, |
| "grad_norm": 7.518472671508789, |
| "learning_rate": 3.374545281527538e-05, |
| "loss": 0.1864, |
| "num_input_tokens_seen": 619520, |
| "step": 4795 |
| }, |
| { |
| "epoch": 4.481792717086835, |
| "grad_norm": 10.482436180114746, |
| "learning_rate": 3.370727466431989e-05, |
| "loss": 0.4683, |
| "num_input_tokens_seen": 620192, |
| "step": 4800 |
| }, |
| { |
| "epoch": 4.486461251167134, |
| "grad_norm": 2.4239566326141357, |
| "learning_rate": 3.3669073389631644e-05, |
| "loss": 0.2869, |
| "num_input_tokens_seen": 620800, |
| "step": 4805 |
| }, |
| { |
| "epoch": 4.491129785247432, |
| "grad_norm": 2.9639928340911865, |
| "learning_rate": 3.3630849092661e-05, |
| "loss": 0.2356, |
| "num_input_tokens_seen": 621440, |
| "step": 4810 |
| }, |
| { |
| "epoch": 4.495798319327731, |
| "grad_norm": 3.342315435409546, |
| "learning_rate": 3.359260187491943e-05, |
| "loss": 0.289, |
| "num_input_tokens_seen": 622080, |
| "step": 4815 |
| }, |
| { |
| "epoch": 4.50046685340803, |
| "grad_norm": 9.140509605407715, |
| "learning_rate": 3.3554331837979307e-05, |
| "loss": 0.2292, |
| "num_input_tokens_seen": 622752, |
| "step": 4820 |
| }, |
| { |
| "epoch": 4.504201680672269, |
| "eval_loss": 0.7412708401679993, |
| "eval_runtime": 3.8742, |
| "eval_samples_per_second": 61.431, |
| "eval_steps_per_second": 30.716, |
| "num_input_tokens_seen": 623280, |
| "step": 4824 |
| }, |
| { |
| "epoch": 4.505135387488329, |
| "grad_norm": 1.1927273273468018, |
| "learning_rate": 3.3516039083473595e-05, |
| "loss": 0.4, |
| "num_input_tokens_seen": 623392, |
| "step": 4825 |
| }, |
| { |
| "epoch": 4.509803921568627, |
| "grad_norm": 3.2595295906066895, |
| "learning_rate": 3.347772371309557e-05, |
| "loss": 0.3292, |
| "num_input_tokens_seen": 624016, |
| "step": 4830 |
| }, |
| { |
| "epoch": 4.514472455648926, |
| "grad_norm": 8.653651237487793, |
| "learning_rate": 3.34393858285986e-05, |
| "loss": 0.4914, |
| "num_input_tokens_seen": 624752, |
| "step": 4835 |
| }, |
| { |
| "epoch": 4.519140989729225, |
| "grad_norm": 5.411193370819092, |
| "learning_rate": 3.340102553179581e-05, |
| "loss": 0.4863, |
| "num_input_tokens_seen": 625440, |
| "step": 4840 |
| }, |
| { |
| "epoch": 4.523809523809524, |
| "grad_norm": 6.883440017700195, |
| "learning_rate": 3.336264292455989e-05, |
| "loss": 0.3992, |
| "num_input_tokens_seen": 625984, |
| "step": 4845 |
| }, |
| { |
| "epoch": 4.5284780578898225, |
| "grad_norm": 4.110279083251953, |
| "learning_rate": 3.3324238108822726e-05, |
| "loss": 0.3535, |
| "num_input_tokens_seen": 626608, |
| "step": 4850 |
| }, |
| { |
| "epoch": 4.533146591970121, |
| "grad_norm": 2.576063394546509, |
| "learning_rate": 3.328581118657522e-05, |
| "loss": 0.2165, |
| "num_input_tokens_seen": 627280, |
| "step": 4855 |
| }, |
| { |
| "epoch": 4.53781512605042, |
| "grad_norm": 0.18222852051258087, |
| "learning_rate": 3.3247362259866956e-05, |
| "loss": 0.1554, |
| "num_input_tokens_seen": 627920, |
| "step": 4860 |
| }, |
| { |
| "epoch": 4.542483660130719, |
| "grad_norm": 10.023906707763672, |
| "learning_rate": 3.3208891430805994e-05, |
| "loss": 0.409, |
| "num_input_tokens_seen": 628496, |
| "step": 4865 |
| }, |
| { |
| "epoch": 4.547152194211018, |
| "grad_norm": 4.180869102478027, |
| "learning_rate": 3.317039880155852e-05, |
| "loss": 0.2893, |
| "num_input_tokens_seen": 629216, |
| "step": 4870 |
| }, |
| { |
| "epoch": 4.551820728291316, |
| "grad_norm": 9.3419189453125, |
| "learning_rate": 3.313188447434862e-05, |
| "loss": 0.1579, |
| "num_input_tokens_seen": 629904, |
| "step": 4875 |
| }, |
| { |
| "epoch": 4.556489262371615, |
| "grad_norm": 2.9429197311401367, |
| "learning_rate": 3.309334855145803e-05, |
| "loss": 0.1973, |
| "num_input_tokens_seen": 630544, |
| "step": 4880 |
| }, |
| { |
| "epoch": 4.561157796451914, |
| "grad_norm": 10.754371643066406, |
| "learning_rate": 3.3054791135225804e-05, |
| "loss": 0.371, |
| "num_input_tokens_seen": 631120, |
| "step": 4885 |
| }, |
| { |
| "epoch": 4.565826330532213, |
| "grad_norm": 6.133115291595459, |
| "learning_rate": 3.30162123280481e-05, |
| "loss": 0.3092, |
| "num_input_tokens_seen": 631760, |
| "step": 4890 |
| }, |
| { |
| "epoch": 4.570494864612511, |
| "grad_norm": 2.5016603469848633, |
| "learning_rate": 3.297761223237788e-05, |
| "loss": 0.2878, |
| "num_input_tokens_seen": 632384, |
| "step": 4895 |
| }, |
| { |
| "epoch": 4.57516339869281, |
| "grad_norm": 5.324434757232666, |
| "learning_rate": 3.293899095072461e-05, |
| "loss": 0.3063, |
| "num_input_tokens_seen": 633024, |
| "step": 4900 |
| }, |
| { |
| "epoch": 4.579831932773109, |
| "grad_norm": 8.173105239868164, |
| "learning_rate": 3.2900348585654076e-05, |
| "loss": 0.2217, |
| "num_input_tokens_seen": 633648, |
| "step": 4905 |
| }, |
| { |
| "epoch": 4.584500466853408, |
| "grad_norm": 2.823427677154541, |
| "learning_rate": 3.286168523978801e-05, |
| "loss": 0.3353, |
| "num_input_tokens_seen": 634272, |
| "step": 4910 |
| }, |
| { |
| "epoch": 4.5891690009337065, |
| "grad_norm": 16.714040756225586, |
| "learning_rate": 3.282300101580386e-05, |
| "loss": 0.1903, |
| "num_input_tokens_seen": 634976, |
| "step": 4915 |
| }, |
| { |
| "epoch": 4.593837535014005, |
| "grad_norm": 3.048602819442749, |
| "learning_rate": 3.278429601643456e-05, |
| "loss": 0.3076, |
| "num_input_tokens_seen": 635584, |
| "step": 4920 |
| }, |
| { |
| "epoch": 4.598506069094304, |
| "grad_norm": 7.008090972900391, |
| "learning_rate": 3.2745570344468166e-05, |
| "loss": 0.2682, |
| "num_input_tokens_seen": 636208, |
| "step": 4925 |
| }, |
| { |
| "epoch": 4.603174603174603, |
| "grad_norm": 4.961944103240967, |
| "learning_rate": 3.2706824102747694e-05, |
| "loss": 0.1542, |
| "num_input_tokens_seen": 636864, |
| "step": 4930 |
| }, |
| { |
| "epoch": 4.607843137254902, |
| "grad_norm": 8.771209716796875, |
| "learning_rate": 3.266805739417073e-05, |
| "loss": 0.5569, |
| "num_input_tokens_seen": 637536, |
| "step": 4935 |
| }, |
| { |
| "epoch": 4.6125116713352, |
| "grad_norm": 1.5831955671310425, |
| "learning_rate": 3.262927032168923e-05, |
| "loss": 0.3309, |
| "num_input_tokens_seen": 638176, |
| "step": 4940 |
| }, |
| { |
| "epoch": 4.617180205415499, |
| "grad_norm": 12.607647895812988, |
| "learning_rate": 3.259046298830924e-05, |
| "loss": 0.2684, |
| "num_input_tokens_seen": 638848, |
| "step": 4945 |
| }, |
| { |
| "epoch": 4.621848739495798, |
| "grad_norm": 7.591634750366211, |
| "learning_rate": 3.255163549709063e-05, |
| "loss": 0.3422, |
| "num_input_tokens_seen": 639520, |
| "step": 4950 |
| }, |
| { |
| "epoch": 4.626517273576097, |
| "grad_norm": 3.3472375869750977, |
| "learning_rate": 3.251278795114676e-05, |
| "loss": 0.236, |
| "num_input_tokens_seen": 640144, |
| "step": 4955 |
| }, |
| { |
| "epoch": 4.631185807656395, |
| "grad_norm": 6.409518241882324, |
| "learning_rate": 3.247392045364426e-05, |
| "loss": 0.2417, |
| "num_input_tokens_seen": 640800, |
| "step": 4960 |
| }, |
| { |
| "epoch": 4.635854341736694, |
| "grad_norm": 7.663181304931641, |
| "learning_rate": 3.243503310780278e-05, |
| "loss": 0.2252, |
| "num_input_tokens_seen": 641488, |
| "step": 4965 |
| }, |
| { |
| "epoch": 4.640522875816993, |
| "grad_norm": 4.352063179016113, |
| "learning_rate": 3.2396126016894646e-05, |
| "loss": 0.6517, |
| "num_input_tokens_seen": 642080, |
| "step": 4970 |
| }, |
| { |
| "epoch": 4.645191409897293, |
| "grad_norm": 3.6530802249908447, |
| "learning_rate": 3.2357199284244626e-05, |
| "loss": 0.1837, |
| "num_input_tokens_seen": 642752, |
| "step": 4975 |
| }, |
| { |
| "epoch": 4.649859943977591, |
| "grad_norm": 9.228134155273438, |
| "learning_rate": 3.231825301322966e-05, |
| "loss": 0.2263, |
| "num_input_tokens_seen": 643264, |
| "step": 4980 |
| }, |
| { |
| "epoch": 4.65452847805789, |
| "grad_norm": 4.053821086883545, |
| "learning_rate": 3.227928730727857e-05, |
| "loss": 0.3454, |
| "num_input_tokens_seen": 643984, |
| "step": 4985 |
| }, |
| { |
| "epoch": 4.659197012138189, |
| "grad_norm": 7.429311275482178, |
| "learning_rate": 3.224030226987179e-05, |
| "loss": 0.311, |
| "num_input_tokens_seen": 644640, |
| "step": 4990 |
| }, |
| { |
| "epoch": 4.663865546218488, |
| "grad_norm": 4.688385963439941, |
| "learning_rate": 3.220129800454108e-05, |
| "loss": 0.5501, |
| "num_input_tokens_seen": 645280, |
| "step": 4995 |
| }, |
| { |
| "epoch": 4.6685340802987865, |
| "grad_norm": 4.372518539428711, |
| "learning_rate": 3.21622746148693e-05, |
| "loss": 0.2034, |
| "num_input_tokens_seen": 645888, |
| "step": 5000 |
| }, |
| { |
| "epoch": 4.673202614379085, |
| "grad_norm": 6.792498588562012, |
| "learning_rate": 3.212323220449006e-05, |
| "loss": 0.2641, |
| "num_input_tokens_seen": 646560, |
| "step": 5005 |
| }, |
| { |
| "epoch": 4.677871148459384, |
| "grad_norm": 6.591304779052734, |
| "learning_rate": 3.2084170877087504e-05, |
| "loss": 0.2674, |
| "num_input_tokens_seen": 647120, |
| "step": 5010 |
| }, |
| { |
| "epoch": 4.682539682539683, |
| "grad_norm": 2.276726007461548, |
| "learning_rate": 3.2045090736396006e-05, |
| "loss": 0.265, |
| "num_input_tokens_seen": 647696, |
| "step": 5015 |
| }, |
| { |
| "epoch": 4.6872082166199815, |
| "grad_norm": 7.44296407699585, |
| "learning_rate": 3.200599188619989e-05, |
| "loss": 0.1148, |
| "num_input_tokens_seen": 648464, |
| "step": 5020 |
| }, |
| { |
| "epoch": 4.69187675070028, |
| "grad_norm": 6.232602596282959, |
| "learning_rate": 3.196687443033321e-05, |
| "loss": 0.3454, |
| "num_input_tokens_seen": 649008, |
| "step": 5025 |
| }, |
| { |
| "epoch": 4.696545284780579, |
| "grad_norm": 6.96764612197876, |
| "learning_rate": 3.192773847267937e-05, |
| "loss": 0.6691, |
| "num_input_tokens_seen": 649568, |
| "step": 5030 |
| }, |
| { |
| "epoch": 4.701213818860878, |
| "grad_norm": 19.437314987182617, |
| "learning_rate": 3.188858411717095e-05, |
| "loss": 0.4101, |
| "num_input_tokens_seen": 650224, |
| "step": 5035 |
| }, |
| { |
| "epoch": 4.705882352941177, |
| "grad_norm": 1.8838727474212646, |
| "learning_rate": 3.184941146778938e-05, |
| "loss": 0.3288, |
| "num_input_tokens_seen": 651152, |
| "step": 5040 |
| }, |
| { |
| "epoch": 4.710550887021475, |
| "grad_norm": 2.7591898441314697, |
| "learning_rate": 3.181022062856466e-05, |
| "loss": 0.1966, |
| "num_input_tokens_seen": 651712, |
| "step": 5045 |
| }, |
| { |
| "epoch": 4.715219421101774, |
| "grad_norm": 0.3163776397705078, |
| "learning_rate": 3.177101170357513e-05, |
| "loss": 0.2892, |
| "num_input_tokens_seen": 652384, |
| "step": 5050 |
| }, |
| { |
| "epoch": 4.719887955182073, |
| "grad_norm": 8.201074600219727, |
| "learning_rate": 3.173178479694712e-05, |
| "loss": 0.3812, |
| "num_input_tokens_seen": 653072, |
| "step": 5055 |
| }, |
| { |
| "epoch": 4.724556489262372, |
| "grad_norm": 2.460468292236328, |
| "learning_rate": 3.1692540012854726e-05, |
| "loss": 0.2103, |
| "num_input_tokens_seen": 653712, |
| "step": 5060 |
| }, |
| { |
| "epoch": 4.7292250233426705, |
| "grad_norm": 3.513843297958374, |
| "learning_rate": 3.165327745551954e-05, |
| "loss": 0.4444, |
| "num_input_tokens_seen": 654368, |
| "step": 5065 |
| }, |
| { |
| "epoch": 4.733893557422969, |
| "grad_norm": 4.301528453826904, |
| "learning_rate": 3.161399722921033e-05, |
| "loss": 0.5824, |
| "num_input_tokens_seen": 655040, |
| "step": 5070 |
| }, |
| { |
| "epoch": 4.738562091503268, |
| "grad_norm": 4.474994659423828, |
| "learning_rate": 3.1574699438242804e-05, |
| "loss": 0.4809, |
| "num_input_tokens_seen": 655600, |
| "step": 5075 |
| }, |
| { |
| "epoch": 4.743230625583567, |
| "grad_norm": 4.316887378692627, |
| "learning_rate": 3.15353841869793e-05, |
| "loss": 0.1618, |
| "num_input_tokens_seen": 656224, |
| "step": 5080 |
| }, |
| { |
| "epoch": 4.7478991596638656, |
| "grad_norm": 7.845536708831787, |
| "learning_rate": 3.149605157982852e-05, |
| "loss": 0.5412, |
| "num_input_tokens_seen": 656832, |
| "step": 5085 |
| }, |
| { |
| "epoch": 4.752567693744164, |
| "grad_norm": 9.774452209472656, |
| "learning_rate": 3.1456701721245305e-05, |
| "loss": 0.3414, |
| "num_input_tokens_seen": 657440, |
| "step": 5090 |
| }, |
| { |
| "epoch": 4.757236227824463, |
| "grad_norm": 3.0468742847442627, |
| "learning_rate": 3.1417334715730265e-05, |
| "loss": 0.2242, |
| "num_input_tokens_seen": 658032, |
| "step": 5095 |
| }, |
| { |
| "epoch": 4.761904761904762, |
| "grad_norm": 9.727741241455078, |
| "learning_rate": 3.137795066782954e-05, |
| "loss": 0.339, |
| "num_input_tokens_seen": 658704, |
| "step": 5100 |
| }, |
| { |
| "epoch": 4.766573295985061, |
| "grad_norm": 14.910123825073242, |
| "learning_rate": 3.1338549682134564e-05, |
| "loss": 0.3962, |
| "num_input_tokens_seen": 659328, |
| "step": 5105 |
| }, |
| { |
| "epoch": 4.771241830065359, |
| "grad_norm": 13.144892692565918, |
| "learning_rate": 3.1299131863281734e-05, |
| "loss": 0.5179, |
| "num_input_tokens_seen": 659968, |
| "step": 5110 |
| }, |
| { |
| "epoch": 4.775910364145658, |
| "grad_norm": 4.052852630615234, |
| "learning_rate": 3.125969731595215e-05, |
| "loss": 0.3517, |
| "num_input_tokens_seen": 660608, |
| "step": 5115 |
| }, |
| { |
| "epoch": 4.780578898225957, |
| "grad_norm": 2.2380478382110596, |
| "learning_rate": 3.1220246144871334e-05, |
| "loss": 0.1959, |
| "num_input_tokens_seen": 661184, |
| "step": 5120 |
| }, |
| { |
| "epoch": 4.785247432306256, |
| "grad_norm": 3.6989505290985107, |
| "learning_rate": 3.118077845480897e-05, |
| "loss": 0.2745, |
| "num_input_tokens_seen": 661808, |
| "step": 5125 |
| }, |
| { |
| "epoch": 4.7899159663865545, |
| "grad_norm": 3.7099645137786865, |
| "learning_rate": 3.11412943505786e-05, |
| "loss": 0.1971, |
| "num_input_tokens_seen": 662576, |
| "step": 5130 |
| }, |
| { |
| "epoch": 4.794584500466853, |
| "grad_norm": 8.147356986999512, |
| "learning_rate": 3.110179393703737e-05, |
| "loss": 0.3137, |
| "num_input_tokens_seen": 663264, |
| "step": 5135 |
| }, |
| { |
| "epoch": 4.799253034547152, |
| "grad_norm": 3.5122320652008057, |
| "learning_rate": 3.106227731908569e-05, |
| "loss": 0.168, |
| "num_input_tokens_seen": 663888, |
| "step": 5140 |
| }, |
| { |
| "epoch": 4.803921568627451, |
| "grad_norm": 1.2344309091567993, |
| "learning_rate": 3.1022744601667076e-05, |
| "loss": 0.2979, |
| "num_input_tokens_seen": 664512, |
| "step": 5145 |
| }, |
| { |
| "epoch": 4.80859010270775, |
| "grad_norm": 2.26531720161438, |
| "learning_rate": 3.0983195889767756e-05, |
| "loss": 0.5175, |
| "num_input_tokens_seen": 665040, |
| "step": 5150 |
| }, |
| { |
| "epoch": 4.813258636788048, |
| "grad_norm": 5.6935930252075195, |
| "learning_rate": 3.0943631288416444e-05, |
| "loss": 0.4906, |
| "num_input_tokens_seen": 665696, |
| "step": 5155 |
| }, |
| { |
| "epoch": 4.817927170868347, |
| "grad_norm": 6.8716912269592285, |
| "learning_rate": 3.0904050902684046e-05, |
| "loss": 0.4338, |
| "num_input_tokens_seen": 666320, |
| "step": 5160 |
| }, |
| { |
| "epoch": 4.822595704948646, |
| "grad_norm": 3.8555526733398438, |
| "learning_rate": 3.086445483768338e-05, |
| "loss": 0.3313, |
| "num_input_tokens_seen": 666944, |
| "step": 5165 |
| }, |
| { |
| "epoch": 4.827264239028945, |
| "grad_norm": 10.221288681030273, |
| "learning_rate": 3.082484319856893e-05, |
| "loss": 0.3209, |
| "num_input_tokens_seen": 667504, |
| "step": 5170 |
| }, |
| { |
| "epoch": 4.831932773109243, |
| "grad_norm": 4.305709362030029, |
| "learning_rate": 3.0785216090536514e-05, |
| "loss": 0.3736, |
| "num_input_tokens_seen": 668176, |
| "step": 5175 |
| }, |
| { |
| "epoch": 4.836601307189542, |
| "grad_norm": 3.446392774581909, |
| "learning_rate": 3.0745573618823046e-05, |
| "loss": 0.3752, |
| "num_input_tokens_seen": 668784, |
| "step": 5180 |
| }, |
| { |
| "epoch": 4.841269841269841, |
| "grad_norm": 2.5847973823547363, |
| "learning_rate": 3.070591588870622e-05, |
| "loss": 0.3039, |
| "num_input_tokens_seen": 669488, |
| "step": 5185 |
| }, |
| { |
| "epoch": 4.84593837535014, |
| "grad_norm": 17.622394561767578, |
| "learning_rate": 3.066624300550427e-05, |
| "loss": 0.8055, |
| "num_input_tokens_seen": 670112, |
| "step": 5190 |
| }, |
| { |
| "epoch": 4.8506069094304385, |
| "grad_norm": 4.8430705070495605, |
| "learning_rate": 3.062655507457569e-05, |
| "loss": 0.1342, |
| "num_input_tokens_seen": 670848, |
| "step": 5195 |
| }, |
| { |
| "epoch": 4.855275443510737, |
| "grad_norm": 4.713992118835449, |
| "learning_rate": 3.058685220131888e-05, |
| "loss": 0.4015, |
| "num_input_tokens_seen": 671648, |
| "step": 5200 |
| }, |
| { |
| "epoch": 4.859943977591037, |
| "grad_norm": 6.5015692710876465, |
| "learning_rate": 3.054713449117197e-05, |
| "loss": 0.3398, |
| "num_input_tokens_seen": 672352, |
| "step": 5205 |
| }, |
| { |
| "epoch": 4.864612511671336, |
| "grad_norm": 1.2088795900344849, |
| "learning_rate": 3.0507402049612482e-05, |
| "loss": 0.2334, |
| "num_input_tokens_seen": 672976, |
| "step": 5210 |
| }, |
| { |
| "epoch": 4.8692810457516345, |
| "grad_norm": 4.3346710205078125, |
| "learning_rate": 3.046765498215705e-05, |
| "loss": 0.3382, |
| "num_input_tokens_seen": 673504, |
| "step": 5215 |
| }, |
| { |
| "epoch": 4.873949579831933, |
| "grad_norm": 6.621752738952637, |
| "learning_rate": 3.042789339436116e-05, |
| "loss": 0.1954, |
| "num_input_tokens_seen": 674048, |
| "step": 5220 |
| }, |
| { |
| "epoch": 4.878618113912232, |
| "grad_norm": 9.04273796081543, |
| "learning_rate": 3.038811739181885e-05, |
| "loss": 0.3084, |
| "num_input_tokens_seen": 674688, |
| "step": 5225 |
| }, |
| { |
| "epoch": 4.883286647992531, |
| "grad_norm": 3.0526490211486816, |
| "learning_rate": 3.0348327080162435e-05, |
| "loss": 0.4238, |
| "num_input_tokens_seen": 675344, |
| "step": 5230 |
| }, |
| { |
| "epoch": 4.8879551820728295, |
| "grad_norm": 6.297845363616943, |
| "learning_rate": 3.0308522565062265e-05, |
| "loss": 0.2566, |
| "num_input_tokens_seen": 676064, |
| "step": 5235 |
| }, |
| { |
| "epoch": 4.892623716153128, |
| "grad_norm": 3.6078665256500244, |
| "learning_rate": 3.026870395222635e-05, |
| "loss": 0.2171, |
| "num_input_tokens_seen": 676704, |
| "step": 5240 |
| }, |
| { |
| "epoch": 4.897292250233427, |
| "grad_norm": 7.2723236083984375, |
| "learning_rate": 3.0228871347400194e-05, |
| "loss": 0.3802, |
| "num_input_tokens_seen": 677408, |
| "step": 5245 |
| }, |
| { |
| "epoch": 4.901960784313726, |
| "grad_norm": 16.52984046936035, |
| "learning_rate": 3.018902485636643e-05, |
| "loss": 0.437, |
| "num_input_tokens_seen": 677936, |
| "step": 5250 |
| }, |
| { |
| "epoch": 4.906629318394025, |
| "grad_norm": 7.225020885467529, |
| "learning_rate": 3.014916458494459e-05, |
| "loss": 0.388, |
| "num_input_tokens_seen": 678512, |
| "step": 5255 |
| }, |
| { |
| "epoch": 4.911297852474323, |
| "grad_norm": 0.8362892270088196, |
| "learning_rate": 3.0109290638990772e-05, |
| "loss": 0.2453, |
| "num_input_tokens_seen": 679104, |
| "step": 5260 |
| }, |
| { |
| "epoch": 4.915966386554622, |
| "grad_norm": 9.181543350219727, |
| "learning_rate": 3.0069403124397412e-05, |
| "loss": 0.3347, |
| "num_input_tokens_seen": 679712, |
| "step": 5265 |
| }, |
| { |
| "epoch": 4.920634920634921, |
| "grad_norm": 4.07673454284668, |
| "learning_rate": 3.002950214709297e-05, |
| "loss": 0.1877, |
| "num_input_tokens_seen": 680304, |
| "step": 5270 |
| }, |
| { |
| "epoch": 4.92530345471522, |
| "grad_norm": 4.580996513366699, |
| "learning_rate": 2.998958781304167e-05, |
| "loss": 0.1706, |
| "num_input_tokens_seen": 680992, |
| "step": 5275 |
| }, |
| { |
| "epoch": 4.9299719887955185, |
| "grad_norm": 3.018458604812622, |
| "learning_rate": 2.994966022824319e-05, |
| "loss": 0.2046, |
| "num_input_tokens_seen": 681600, |
| "step": 5280 |
| }, |
| { |
| "epoch": 4.934640522875817, |
| "grad_norm": 4.071977138519287, |
| "learning_rate": 2.9909719498732414e-05, |
| "loss": 0.3367, |
| "num_input_tokens_seen": 682336, |
| "step": 5285 |
| }, |
| { |
| "epoch": 4.939309056956116, |
| "grad_norm": 2.586224317550659, |
| "learning_rate": 2.9869765730579125e-05, |
| "loss": 0.2818, |
| "num_input_tokens_seen": 683040, |
| "step": 5290 |
| }, |
| { |
| "epoch": 4.943977591036415, |
| "grad_norm": 9.728919982910156, |
| "learning_rate": 2.9829799029887738e-05, |
| "loss": 0.4241, |
| "num_input_tokens_seen": 683632, |
| "step": 5295 |
| }, |
| { |
| "epoch": 4.9486461251167135, |
| "grad_norm": 5.366093635559082, |
| "learning_rate": 2.9789819502797012e-05, |
| "loss": 0.2647, |
| "num_input_tokens_seen": 684304, |
| "step": 5300 |
| }, |
| { |
| "epoch": 4.953314659197012, |
| "grad_norm": 2.941143751144409, |
| "learning_rate": 2.9749827255479755e-05, |
| "loss": 0.2741, |
| "num_input_tokens_seen": 684960, |
| "step": 5305 |
| }, |
| { |
| "epoch": 4.957983193277311, |
| "grad_norm": 2.2302086353302, |
| "learning_rate": 2.9709822394142572e-05, |
| "loss": 0.2222, |
| "num_input_tokens_seen": 685744, |
| "step": 5310 |
| }, |
| { |
| "epoch": 4.96265172735761, |
| "grad_norm": 6.357166767120361, |
| "learning_rate": 2.9669805025025567e-05, |
| "loss": 0.5765, |
| "num_input_tokens_seen": 686352, |
| "step": 5315 |
| }, |
| { |
| "epoch": 4.967320261437909, |
| "grad_norm": 6.87725305557251, |
| "learning_rate": 2.9629775254402053e-05, |
| "loss": 0.208, |
| "num_input_tokens_seen": 686960, |
| "step": 5320 |
| }, |
| { |
| "epoch": 4.971988795518207, |
| "grad_norm": 6.549231052398682, |
| "learning_rate": 2.958973318857827e-05, |
| "loss": 0.2937, |
| "num_input_tokens_seen": 687584, |
| "step": 5325 |
| }, |
| { |
| "epoch": 4.976657329598506, |
| "grad_norm": 1.1992058753967285, |
| "learning_rate": 2.9549678933893143e-05, |
| "loss": 0.3239, |
| "num_input_tokens_seen": 688272, |
| "step": 5330 |
| }, |
| { |
| "epoch": 4.981325863678805, |
| "grad_norm": 4.71303129196167, |
| "learning_rate": 2.950961259671793e-05, |
| "loss": 0.4471, |
| "num_input_tokens_seen": 688880, |
| "step": 5335 |
| }, |
| { |
| "epoch": 4.985994397759104, |
| "grad_norm": 4.6151933670043945, |
| "learning_rate": 2.946953428345598e-05, |
| "loss": 0.3116, |
| "num_input_tokens_seen": 689504, |
| "step": 5340 |
| }, |
| { |
| "epoch": 4.9906629318394025, |
| "grad_norm": 4.89030647277832, |
| "learning_rate": 2.942944410054248e-05, |
| "loss": 0.2833, |
| "num_input_tokens_seen": 690160, |
| "step": 5345 |
| }, |
| { |
| "epoch": 4.995331465919701, |
| "grad_norm": 14.496315956115723, |
| "learning_rate": 2.9389342154444093e-05, |
| "loss": 0.5057, |
| "num_input_tokens_seen": 690736, |
| "step": 5350 |
| }, |
| { |
| "epoch": 5.0, |
| "grad_norm": 11.446124076843262, |
| "learning_rate": 2.9349228551658766e-05, |
| "loss": 0.5066, |
| "num_input_tokens_seen": 691304, |
| "step": 5355 |
| }, |
| { |
| "epoch": 5.004668534080299, |
| "grad_norm": 4.030084609985352, |
| "learning_rate": 2.930910339871536e-05, |
| "loss": 0.2427, |
| "num_input_tokens_seen": 691912, |
| "step": 5360 |
| }, |
| { |
| "epoch": 5.004668534080299, |
| "eval_loss": 0.7145299911499023, |
| "eval_runtime": 3.8675, |
| "eval_samples_per_second": 61.539, |
| "eval_steps_per_second": 30.769, |
| "num_input_tokens_seen": 691912, |
| "step": 5360 |
| }, |
| { |
| "epoch": 5.0093370681605975, |
| "grad_norm": 2.417534351348877, |
| "learning_rate": 2.9268966802173436e-05, |
| "loss": 0.1194, |
| "num_input_tokens_seen": 692440, |
| "step": 5365 |
| }, |
| { |
| "epoch": 5.014005602240896, |
| "grad_norm": 3.961277484893799, |
| "learning_rate": 2.9228818868622953e-05, |
| "loss": 0.4708, |
| "num_input_tokens_seen": 693032, |
| "step": 5370 |
| }, |
| { |
| "epoch": 5.018674136321195, |
| "grad_norm": 4.8509721755981445, |
| "learning_rate": 2.9188659704683953e-05, |
| "loss": 0.3194, |
| "num_input_tokens_seen": 693704, |
| "step": 5375 |
| }, |
| { |
| "epoch": 5.023342670401494, |
| "grad_norm": 2.1123046875, |
| "learning_rate": 2.9148489417006308e-05, |
| "loss": 0.1934, |
| "num_input_tokens_seen": 694328, |
| "step": 5380 |
| }, |
| { |
| "epoch": 5.028011204481793, |
| "grad_norm": 0.24643854796886444, |
| "learning_rate": 2.910830811226944e-05, |
| "loss": 0.1006, |
| "num_input_tokens_seen": 695000, |
| "step": 5385 |
| }, |
| { |
| "epoch": 5.032679738562091, |
| "grad_norm": 1.844314455986023, |
| "learning_rate": 2.9068115897182036e-05, |
| "loss": 0.1476, |
| "num_input_tokens_seen": 695656, |
| "step": 5390 |
| }, |
| { |
| "epoch": 5.03734827264239, |
| "grad_norm": 9.13713550567627, |
| "learning_rate": 2.902791287848173e-05, |
| "loss": 0.2345, |
| "num_input_tokens_seen": 696264, |
| "step": 5395 |
| }, |
| { |
| "epoch": 5.042016806722689, |
| "grad_norm": 1.417795181274414, |
| "learning_rate": 2.898769916293488e-05, |
| "loss": 0.1987, |
| "num_input_tokens_seen": 696856, |
| "step": 5400 |
| }, |
| { |
| "epoch": 5.046685340802988, |
| "grad_norm": 8.42477798461914, |
| "learning_rate": 2.894747485733622e-05, |
| "loss": 0.2888, |
| "num_input_tokens_seen": 697512, |
| "step": 5405 |
| }, |
| { |
| "epoch": 5.0513538748832865, |
| "grad_norm": 7.067439079284668, |
| "learning_rate": 2.8907240068508627e-05, |
| "loss": 0.3286, |
| "num_input_tokens_seen": 698152, |
| "step": 5410 |
| }, |
| { |
| "epoch": 5.056022408963585, |
| "grad_norm": 3.5997512340545654, |
| "learning_rate": 2.8866994903302823e-05, |
| "loss": 0.1111, |
| "num_input_tokens_seen": 698824, |
| "step": 5415 |
| }, |
| { |
| "epoch": 5.060690943043884, |
| "grad_norm": 4.830616474151611, |
| "learning_rate": 2.8826739468597068e-05, |
| "loss": 0.1264, |
| "num_input_tokens_seen": 699544, |
| "step": 5420 |
| }, |
| { |
| "epoch": 5.065359477124183, |
| "grad_norm": 15.35937786102295, |
| "learning_rate": 2.87864738712969e-05, |
| "loss": 0.2375, |
| "num_input_tokens_seen": 700216, |
| "step": 5425 |
| }, |
| { |
| "epoch": 5.0700280112044815, |
| "grad_norm": 0.5134664177894592, |
| "learning_rate": 2.8746198218334862e-05, |
| "loss": 0.1763, |
| "num_input_tokens_seen": 700840, |
| "step": 5430 |
| }, |
| { |
| "epoch": 5.07469654528478, |
| "grad_norm": 9.345810890197754, |
| "learning_rate": 2.870591261667018e-05, |
| "loss": 0.1846, |
| "num_input_tokens_seen": 701544, |
| "step": 5435 |
| }, |
| { |
| "epoch": 5.079365079365079, |
| "grad_norm": 11.941632270812988, |
| "learning_rate": 2.8665617173288516e-05, |
| "loss": 0.2576, |
| "num_input_tokens_seen": 702104, |
| "step": 5440 |
| }, |
| { |
| "epoch": 5.084033613445378, |
| "grad_norm": 5.5290422439575195, |
| "learning_rate": 2.8625311995201648e-05, |
| "loss": 0.0805, |
| "num_input_tokens_seen": 702776, |
| "step": 5445 |
| }, |
| { |
| "epoch": 5.088702147525677, |
| "grad_norm": 4.16331672668457, |
| "learning_rate": 2.8584997189447226e-05, |
| "loss": 0.1695, |
| "num_input_tokens_seen": 703400, |
| "step": 5450 |
| }, |
| { |
| "epoch": 5.093370681605975, |
| "grad_norm": 14.67473030090332, |
| "learning_rate": 2.854467286308848e-05, |
| "loss": 0.3571, |
| "num_input_tokens_seen": 704024, |
| "step": 5455 |
| }, |
| { |
| "epoch": 5.098039215686274, |
| "grad_norm": 4.0992255210876465, |
| "learning_rate": 2.8504339123213886e-05, |
| "loss": 0.0809, |
| "num_input_tokens_seen": 704712, |
| "step": 5460 |
| }, |
| { |
| "epoch": 5.102707749766573, |
| "grad_norm": 10.043177604675293, |
| "learning_rate": 2.8463996076936944e-05, |
| "loss": 0.1856, |
| "num_input_tokens_seen": 705384, |
| "step": 5465 |
| }, |
| { |
| "epoch": 5.107376283846872, |
| "grad_norm": 10.882224082946777, |
| "learning_rate": 2.8423643831395856e-05, |
| "loss": 0.3468, |
| "num_input_tokens_seen": 706056, |
| "step": 5470 |
| }, |
| { |
| "epoch": 5.1120448179271705, |
| "grad_norm": 2.696959972381592, |
| "learning_rate": 2.8383282493753283e-05, |
| "loss": 0.0661, |
| "num_input_tokens_seen": 706664, |
| "step": 5475 |
| }, |
| { |
| "epoch": 5.116713352007469, |
| "grad_norm": 12.80626106262207, |
| "learning_rate": 2.834291217119599e-05, |
| "loss": 0.3852, |
| "num_input_tokens_seen": 707224, |
| "step": 5480 |
| }, |
| { |
| "epoch": 5.121381886087768, |
| "grad_norm": 0.5678945183753967, |
| "learning_rate": 2.830253297093463e-05, |
| "loss": 0.1757, |
| "num_input_tokens_seen": 707800, |
| "step": 5485 |
| }, |
| { |
| "epoch": 5.126050420168067, |
| "grad_norm": 3.3095643520355225, |
| "learning_rate": 2.826214500020344e-05, |
| "loss": 0.2569, |
| "num_input_tokens_seen": 708456, |
| "step": 5490 |
| }, |
| { |
| "epoch": 5.130718954248366, |
| "grad_norm": 75.66603088378906, |
| "learning_rate": 2.8221748366259915e-05, |
| "loss": 0.2313, |
| "num_input_tokens_seen": 709080, |
| "step": 5495 |
| }, |
| { |
| "epoch": 5.135387488328665, |
| "grad_norm": 8.143814086914062, |
| "learning_rate": 2.818134317638459e-05, |
| "loss": 0.2562, |
| "num_input_tokens_seen": 709720, |
| "step": 5500 |
| }, |
| { |
| "epoch": 5.140056022408964, |
| "grad_norm": 3.207918405532837, |
| "learning_rate": 2.81409295378807e-05, |
| "loss": 0.1835, |
| "num_input_tokens_seen": 710344, |
| "step": 5505 |
| }, |
| { |
| "epoch": 5.144724556489263, |
| "grad_norm": 2.8475122451782227, |
| "learning_rate": 2.8100507558073924e-05, |
| "loss": 0.2123, |
| "num_input_tokens_seen": 711000, |
| "step": 5510 |
| }, |
| { |
| "epoch": 5.1493930905695615, |
| "grad_norm": 1.4689849615097046, |
| "learning_rate": 2.8060077344312106e-05, |
| "loss": 0.1878, |
| "num_input_tokens_seen": 711592, |
| "step": 5515 |
| }, |
| { |
| "epoch": 5.15406162464986, |
| "grad_norm": 6.083222389221191, |
| "learning_rate": 2.8019639003964936e-05, |
| "loss": 0.2004, |
| "num_input_tokens_seen": 712264, |
| "step": 5520 |
| }, |
| { |
| "epoch": 5.158730158730159, |
| "grad_norm": 4.9259748458862305, |
| "learning_rate": 2.7979192644423703e-05, |
| "loss": 0.1988, |
| "num_input_tokens_seen": 712936, |
| "step": 5525 |
| }, |
| { |
| "epoch": 5.163398692810458, |
| "grad_norm": 2.0269927978515625, |
| "learning_rate": 2.7938738373100966e-05, |
| "loss": 0.1394, |
| "num_input_tokens_seen": 713624, |
| "step": 5530 |
| }, |
| { |
| "epoch": 5.168067226890757, |
| "grad_norm": 17.339391708374023, |
| "learning_rate": 2.789827629743032e-05, |
| "loss": 0.355, |
| "num_input_tokens_seen": 714312, |
| "step": 5535 |
| }, |
| { |
| "epoch": 5.172735760971055, |
| "grad_norm": 4.577376365661621, |
| "learning_rate": 2.78578065248661e-05, |
| "loss": 0.1229, |
| "num_input_tokens_seen": 714968, |
| "step": 5540 |
| }, |
| { |
| "epoch": 5.177404295051354, |
| "grad_norm": 3.624584197998047, |
| "learning_rate": 2.781732916288303e-05, |
| "loss": 0.09, |
| "num_input_tokens_seen": 715608, |
| "step": 5545 |
| }, |
| { |
| "epoch": 5.182072829131653, |
| "grad_norm": 3.7476773262023926, |
| "learning_rate": 2.7776844318976035e-05, |
| "loss": 0.2231, |
| "num_input_tokens_seen": 716232, |
| "step": 5550 |
| }, |
| { |
| "epoch": 5.186741363211952, |
| "grad_norm": 3.4681551456451416, |
| "learning_rate": 2.773635210065989e-05, |
| "loss": 0.1263, |
| "num_input_tokens_seen": 716936, |
| "step": 5555 |
| }, |
| { |
| "epoch": 5.19140989729225, |
| "grad_norm": 4.748959064483643, |
| "learning_rate": 2.769585261546897e-05, |
| "loss": 0.2337, |
| "num_input_tokens_seen": 717576, |
| "step": 5560 |
| }, |
| { |
| "epoch": 5.196078431372549, |
| "grad_norm": 6.100517272949219, |
| "learning_rate": 2.765534597095692e-05, |
| "loss": 0.1429, |
| "num_input_tokens_seen": 718248, |
| "step": 5565 |
| }, |
| { |
| "epoch": 5.200746965452848, |
| "grad_norm": 3.645913600921631, |
| "learning_rate": 2.7614832274696416e-05, |
| "loss": 0.3484, |
| "num_input_tokens_seen": 718904, |
| "step": 5570 |
| }, |
| { |
| "epoch": 5.205415499533147, |
| "grad_norm": 6.494524002075195, |
| "learning_rate": 2.7574311634278872e-05, |
| "loss": 0.2583, |
| "num_input_tokens_seen": 719640, |
| "step": 5575 |
| }, |
| { |
| "epoch": 5.2100840336134455, |
| "grad_norm": 14.866647720336914, |
| "learning_rate": 2.753378415731412e-05, |
| "loss": 0.3697, |
| "num_input_tokens_seen": 720344, |
| "step": 5580 |
| }, |
| { |
| "epoch": 5.214752567693744, |
| "grad_norm": 6.337793827056885, |
| "learning_rate": 2.749324995143016e-05, |
| "loss": 0.1749, |
| "num_input_tokens_seen": 721064, |
| "step": 5585 |
| }, |
| { |
| "epoch": 5.219421101774043, |
| "grad_norm": 3.1200544834136963, |
| "learning_rate": 2.7452709124272863e-05, |
| "loss": 0.1373, |
| "num_input_tokens_seen": 721672, |
| "step": 5590 |
| }, |
| { |
| "epoch": 5.224089635854342, |
| "grad_norm": 5.817122936248779, |
| "learning_rate": 2.741216178350568e-05, |
| "loss": 0.2363, |
| "num_input_tokens_seen": 722280, |
| "step": 5595 |
| }, |
| { |
| "epoch": 5.228758169934641, |
| "grad_norm": 0.6314363479614258, |
| "learning_rate": 2.7371608036809364e-05, |
| "loss": 0.457, |
| "num_input_tokens_seen": 722920, |
| "step": 5600 |
| }, |
| { |
| "epoch": 5.233426704014939, |
| "grad_norm": 8.405885696411133, |
| "learning_rate": 2.733104799188168e-05, |
| "loss": 0.2979, |
| "num_input_tokens_seen": 723560, |
| "step": 5605 |
| }, |
| { |
| "epoch": 5.238095238095238, |
| "grad_norm": 5.43946647644043, |
| "learning_rate": 2.7290481756437112e-05, |
| "loss": 0.1767, |
| "num_input_tokens_seen": 724136, |
| "step": 5610 |
| }, |
| { |
| "epoch": 5.242763772175537, |
| "grad_norm": 0.9292370080947876, |
| "learning_rate": 2.724990943820659e-05, |
| "loss": 0.1797, |
| "num_input_tokens_seen": 724728, |
| "step": 5615 |
| }, |
| { |
| "epoch": 5.247432306255836, |
| "grad_norm": 3.5616211891174316, |
| "learning_rate": 2.72093311449372e-05, |
| "loss": 0.1392, |
| "num_input_tokens_seen": 725368, |
| "step": 5620 |
| }, |
| { |
| "epoch": 5.2521008403361344, |
| "grad_norm": 6.952834606170654, |
| "learning_rate": 2.716874698439189e-05, |
| "loss": 0.3549, |
| "num_input_tokens_seen": 726072, |
| "step": 5625 |
| }, |
| { |
| "epoch": 5.256769374416433, |
| "grad_norm": 3.403163433074951, |
| "learning_rate": 2.7128157064349186e-05, |
| "loss": 0.2227, |
| "num_input_tokens_seen": 726744, |
| "step": 5630 |
| }, |
| { |
| "epoch": 5.261437908496732, |
| "grad_norm": 0.7634921669960022, |
| "learning_rate": 2.7087561492602925e-05, |
| "loss": 0.0984, |
| "num_input_tokens_seen": 727368, |
| "step": 5635 |
| }, |
| { |
| "epoch": 5.266106442577031, |
| "grad_norm": 0.6099923253059387, |
| "learning_rate": 2.7046960376961934e-05, |
| "loss": 0.6007, |
| "num_input_tokens_seen": 728008, |
| "step": 5640 |
| }, |
| { |
| "epoch": 5.2707749766573295, |
| "grad_norm": 2.206415891647339, |
| "learning_rate": 2.7006353825249792e-05, |
| "loss": 0.1308, |
| "num_input_tokens_seen": 728696, |
| "step": 5645 |
| }, |
| { |
| "epoch": 5.275443510737628, |
| "grad_norm": 6.221801280975342, |
| "learning_rate": 2.6965741945304467e-05, |
| "loss": 0.2882, |
| "num_input_tokens_seen": 729400, |
| "step": 5650 |
| }, |
| { |
| "epoch": 5.280112044817927, |
| "grad_norm": 3.225241184234619, |
| "learning_rate": 2.6925124844978126e-05, |
| "loss": 0.2439, |
| "num_input_tokens_seen": 730024, |
| "step": 5655 |
| }, |
| { |
| "epoch": 5.284780578898226, |
| "grad_norm": 8.841064453125, |
| "learning_rate": 2.6884502632136777e-05, |
| "loss": 0.1993, |
| "num_input_tokens_seen": 730664, |
| "step": 5660 |
| }, |
| { |
| "epoch": 5.289449112978525, |
| "grad_norm": 2.7826311588287354, |
| "learning_rate": 2.6843875414659996e-05, |
| "loss": 0.2607, |
| "num_input_tokens_seen": 731320, |
| "step": 5665 |
| }, |
| { |
| "epoch": 5.294117647058823, |
| "grad_norm": 4.322564601898193, |
| "learning_rate": 2.680324330044067e-05, |
| "loss": 0.1684, |
| "num_input_tokens_seen": 731992, |
| "step": 5670 |
| }, |
| { |
| "epoch": 5.298786181139122, |
| "grad_norm": 1.4055625200271606, |
| "learning_rate": 2.6762606397384677e-05, |
| "loss": 0.1244, |
| "num_input_tokens_seen": 732808, |
| "step": 5675 |
| }, |
| { |
| "epoch": 5.303454715219421, |
| "grad_norm": 3.7955009937286377, |
| "learning_rate": 2.6721964813410616e-05, |
| "loss": 0.2172, |
| "num_input_tokens_seen": 733448, |
| "step": 5680 |
| }, |
| { |
| "epoch": 5.30812324929972, |
| "grad_norm": 1.5904556512832642, |
| "learning_rate": 2.6681318656449522e-05, |
| "loss": 0.2895, |
| "num_input_tokens_seen": 734040, |
| "step": 5685 |
| }, |
| { |
| "epoch": 5.3127917833800185, |
| "grad_norm": 4.815072059631348, |
| "learning_rate": 2.664066803444456e-05, |
| "loss": 0.1865, |
| "num_input_tokens_seen": 734760, |
| "step": 5690 |
| }, |
| { |
| "epoch": 5.317460317460317, |
| "grad_norm": 11.906131744384766, |
| "learning_rate": 2.6600013055350776e-05, |
| "loss": 0.1374, |
| "num_input_tokens_seen": 735448, |
| "step": 5695 |
| }, |
| { |
| "epoch": 5.322128851540616, |
| "grad_norm": 2.0406405925750732, |
| "learning_rate": 2.6559353827134754e-05, |
| "loss": 0.1208, |
| "num_input_tokens_seen": 736104, |
| "step": 5700 |
| }, |
| { |
| "epoch": 5.326797385620915, |
| "grad_norm": 5.281521320343018, |
| "learning_rate": 2.651869045777441e-05, |
| "loss": 0.2245, |
| "num_input_tokens_seen": 736696, |
| "step": 5705 |
| }, |
| { |
| "epoch": 5.3314659197012135, |
| "grad_norm": 2.748307704925537, |
| "learning_rate": 2.6478023055258606e-05, |
| "loss": 0.2027, |
| "num_input_tokens_seen": 737336, |
| "step": 5710 |
| }, |
| { |
| "epoch": 5.336134453781512, |
| "grad_norm": 18.87708854675293, |
| "learning_rate": 2.643735172758694e-05, |
| "loss": 0.2526, |
| "num_input_tokens_seen": 737976, |
| "step": 5715 |
| }, |
| { |
| "epoch": 5.340802987861811, |
| "grad_norm": 3.8325388431549072, |
| "learning_rate": 2.6396676582769447e-05, |
| "loss": 0.2278, |
| "num_input_tokens_seen": 738648, |
| "step": 5720 |
| }, |
| { |
| "epoch": 5.34547152194211, |
| "grad_norm": 4.2706990242004395, |
| "learning_rate": 2.6355997728826276e-05, |
| "loss": 0.0846, |
| "num_input_tokens_seen": 739384, |
| "step": 5725 |
| }, |
| { |
| "epoch": 5.350140056022409, |
| "grad_norm": 2.1158320903778076, |
| "learning_rate": 2.6315315273787428e-05, |
| "loss": 0.2136, |
| "num_input_tokens_seen": 739992, |
| "step": 5730 |
| }, |
| { |
| "epoch": 5.354808590102707, |
| "grad_norm": 3.025188684463501, |
| "learning_rate": 2.627462932569248e-05, |
| "loss": 0.3116, |
| "num_input_tokens_seen": 740824, |
| "step": 5735 |
| }, |
| { |
| "epoch": 5.359477124183006, |
| "grad_norm": 3.6472253799438477, |
| "learning_rate": 2.6233939992590277e-05, |
| "loss": 0.3267, |
| "num_input_tokens_seen": 741448, |
| "step": 5740 |
| }, |
| { |
| "epoch": 5.364145658263305, |
| "grad_norm": 1.7700618505477905, |
| "learning_rate": 2.619324738253867e-05, |
| "loss": 0.1885, |
| "num_input_tokens_seen": 742184, |
| "step": 5745 |
| }, |
| { |
| "epoch": 5.368814192343605, |
| "grad_norm": 0.3386409878730774, |
| "learning_rate": 2.6152551603604176e-05, |
| "loss": 0.1666, |
| "num_input_tokens_seen": 742808, |
| "step": 5750 |
| }, |
| { |
| "epoch": 5.373482726423903, |
| "grad_norm": 1.589062213897705, |
| "learning_rate": 2.611185276386176e-05, |
| "loss": 0.3356, |
| "num_input_tokens_seen": 743384, |
| "step": 5755 |
| }, |
| { |
| "epoch": 5.378151260504202, |
| "grad_norm": 3.3072431087493896, |
| "learning_rate": 2.6071150971394503e-05, |
| "loss": 0.1891, |
| "num_input_tokens_seen": 743944, |
| "step": 5760 |
| }, |
| { |
| "epoch": 5.382819794584501, |
| "grad_norm": 12.216253280639648, |
| "learning_rate": 2.603044633429334e-05, |
| "loss": 0.5011, |
| "num_input_tokens_seen": 744520, |
| "step": 5765 |
| }, |
| { |
| "epoch": 5.3874883286648, |
| "grad_norm": 10.251158714294434, |
| "learning_rate": 2.598973896065674e-05, |
| "loss": 0.2908, |
| "num_input_tokens_seen": 745176, |
| "step": 5770 |
| }, |
| { |
| "epoch": 5.392156862745098, |
| "grad_norm": 2.105750560760498, |
| "learning_rate": 2.5949028958590447e-05, |
| "loss": 0.1305, |
| "num_input_tokens_seen": 745848, |
| "step": 5775 |
| }, |
| { |
| "epoch": 5.396825396825397, |
| "grad_norm": 3.5795063972473145, |
| "learning_rate": 2.5908316436207203e-05, |
| "loss": 0.198, |
| "num_input_tokens_seen": 746504, |
| "step": 5780 |
| }, |
| { |
| "epoch": 5.401493930905696, |
| "grad_norm": 7.272071361541748, |
| "learning_rate": 2.5867601501626415e-05, |
| "loss": 0.3211, |
| "num_input_tokens_seen": 747128, |
| "step": 5785 |
| }, |
| { |
| "epoch": 5.406162464985995, |
| "grad_norm": 3.216158866882324, |
| "learning_rate": 2.5826884262973906e-05, |
| "loss": 0.0894, |
| "num_input_tokens_seen": 747752, |
| "step": 5790 |
| }, |
| { |
| "epoch": 5.4108309990662935, |
| "grad_norm": 1.1698273420333862, |
| "learning_rate": 2.5786164828381633e-05, |
| "loss": 0.2741, |
| "num_input_tokens_seen": 748296, |
| "step": 5795 |
| }, |
| { |
| "epoch": 5.415499533146592, |
| "grad_norm": 6.914623260498047, |
| "learning_rate": 2.5745443305987366e-05, |
| "loss": 0.2973, |
| "num_input_tokens_seen": 748920, |
| "step": 5800 |
| }, |
| { |
| "epoch": 5.420168067226891, |
| "grad_norm": 8.555301666259766, |
| "learning_rate": 2.570471980393443e-05, |
| "loss": 0.2134, |
| "num_input_tokens_seen": 749672, |
| "step": 5805 |
| }, |
| { |
| "epoch": 5.42483660130719, |
| "grad_norm": 3.0789635181427, |
| "learning_rate": 2.5663994430371403e-05, |
| "loss": 0.2423, |
| "num_input_tokens_seen": 750344, |
| "step": 5810 |
| }, |
| { |
| "epoch": 5.429505135387489, |
| "grad_norm": 19.087554931640625, |
| "learning_rate": 2.5623267293451826e-05, |
| "loss": 0.1097, |
| "num_input_tokens_seen": 751000, |
| "step": 5815 |
| }, |
| { |
| "epoch": 5.434173669467787, |
| "grad_norm": 10.77205753326416, |
| "learning_rate": 2.5582538501333934e-05, |
| "loss": 0.2338, |
| "num_input_tokens_seen": 751752, |
| "step": 5820 |
| }, |
| { |
| "epoch": 5.438842203548086, |
| "grad_norm": 1.9925490617752075, |
| "learning_rate": 2.5541808162180364e-05, |
| "loss": 0.1922, |
| "num_input_tokens_seen": 752392, |
| "step": 5825 |
| }, |
| { |
| "epoch": 5.443510737628385, |
| "grad_norm": 11.317987442016602, |
| "learning_rate": 2.5501076384157848e-05, |
| "loss": 0.323, |
| "num_input_tokens_seen": 753112, |
| "step": 5830 |
| }, |
| { |
| "epoch": 5.448179271708684, |
| "grad_norm": 6.738025188446045, |
| "learning_rate": 2.5460343275436925e-05, |
| "loss": 0.1361, |
| "num_input_tokens_seen": 753704, |
| "step": 5835 |
| }, |
| { |
| "epoch": 5.452847805788982, |
| "grad_norm": 6.475763320922852, |
| "learning_rate": 2.5419608944191714e-05, |
| "loss": 0.2934, |
| "num_input_tokens_seen": 754344, |
| "step": 5840 |
| }, |
| { |
| "epoch": 5.457516339869281, |
| "grad_norm": 0.0828949436545372, |
| "learning_rate": 2.5378873498599535e-05, |
| "loss": 0.2367, |
| "num_input_tokens_seen": 755128, |
| "step": 5845 |
| }, |
| { |
| "epoch": 5.46218487394958, |
| "grad_norm": 0.22640591859817505, |
| "learning_rate": 2.5338137046840687e-05, |
| "loss": 0.1342, |
| "num_input_tokens_seen": 755800, |
| "step": 5850 |
| }, |
| { |
| "epoch": 5.466853408029879, |
| "grad_norm": 6.038141250610352, |
| "learning_rate": 2.529739969709814e-05, |
| "loss": 0.1208, |
| "num_input_tokens_seen": 756520, |
| "step": 5855 |
| }, |
| { |
| "epoch": 5.4715219421101775, |
| "grad_norm": 12.068619728088379, |
| "learning_rate": 2.5256661557557247e-05, |
| "loss": 0.5476, |
| "num_input_tokens_seen": 757128, |
| "step": 5860 |
| }, |
| { |
| "epoch": 5.476190476190476, |
| "grad_norm": 8.999595642089844, |
| "learning_rate": 2.5215922736405468e-05, |
| "loss": 0.1985, |
| "num_input_tokens_seen": 757864, |
| "step": 5865 |
| }, |
| { |
| "epoch": 5.480859010270775, |
| "grad_norm": 6.384941101074219, |
| "learning_rate": 2.5175183341832048e-05, |
| "loss": 0.3681, |
| "num_input_tokens_seen": 758472, |
| "step": 5870 |
| }, |
| { |
| "epoch": 5.485527544351074, |
| "grad_norm": 4.548037052154541, |
| "learning_rate": 2.513444348202778e-05, |
| "loss": 0.1332, |
| "num_input_tokens_seen": 759032, |
| "step": 5875 |
| }, |
| { |
| "epoch": 5.490196078431373, |
| "grad_norm": 3.7748093605041504, |
| "learning_rate": 2.5093703265184686e-05, |
| "loss": 0.1326, |
| "num_input_tokens_seen": 759624, |
| "step": 5880 |
| }, |
| { |
| "epoch": 5.494864612511671, |
| "grad_norm": 3.9037418365478516, |
| "learning_rate": 2.505296279949574e-05, |
| "loss": 0.1086, |
| "num_input_tokens_seen": 760264, |
| "step": 5885 |
| }, |
| { |
| "epoch": 5.49953314659197, |
| "grad_norm": 0.08623213320970535, |
| "learning_rate": 2.5012222193154548e-05, |
| "loss": 0.1744, |
| "num_input_tokens_seen": 761192, |
| "step": 5890 |
| }, |
| { |
| "epoch": 5.504201680672269, |
| "grad_norm": 7.6671013832092285, |
| "learning_rate": 2.4971481554355133e-05, |
| "loss": 0.2383, |
| "num_input_tokens_seen": 761864, |
| "step": 5895 |
| }, |
| { |
| "epoch": 5.505135387488329, |
| "eval_loss": 0.8534978032112122, |
| "eval_runtime": 3.8678, |
| "eval_samples_per_second": 61.533, |
| "eval_steps_per_second": 30.766, |
| "num_input_tokens_seen": 762008, |
| "step": 5896 |
| }, |
| { |
| "epoch": 5.508870214752568, |
| "grad_norm": 5.7819647789001465, |
| "learning_rate": 2.4930740991291567e-05, |
| "loss": 0.208, |
| "num_input_tokens_seen": 762552, |
| "step": 5900 |
| }, |
| { |
| "epoch": 5.513538748832866, |
| "grad_norm": 4.081792831420898, |
| "learning_rate": 2.489000061215775e-05, |
| "loss": 0.1272, |
| "num_input_tokens_seen": 763096, |
| "step": 5905 |
| }, |
| { |
| "epoch": 5.518207282913165, |
| "grad_norm": 7.347453594207764, |
| "learning_rate": 2.4849260525147078e-05, |
| "loss": 0.4156, |
| "num_input_tokens_seen": 763768, |
| "step": 5910 |
| }, |
| { |
| "epoch": 5.522875816993464, |
| "grad_norm": 10.584314346313477, |
| "learning_rate": 2.4808520838452168e-05, |
| "loss": 0.3707, |
| "num_input_tokens_seen": 764344, |
| "step": 5915 |
| }, |
| { |
| "epoch": 5.527544351073763, |
| "grad_norm": 10.097419738769531, |
| "learning_rate": 2.4767781660264596e-05, |
| "loss": 0.2649, |
| "num_input_tokens_seen": 764904, |
| "step": 5920 |
| }, |
| { |
| "epoch": 5.5322128851540615, |
| "grad_norm": 4.417219161987305, |
| "learning_rate": 2.4727043098774548e-05, |
| "loss": 0.1982, |
| "num_input_tokens_seen": 765512, |
| "step": 5925 |
| }, |
| { |
| "epoch": 5.53688141923436, |
| "grad_norm": 2.841074228286743, |
| "learning_rate": 2.4686305262170617e-05, |
| "loss": 0.4358, |
| "num_input_tokens_seen": 766072, |
| "step": 5930 |
| }, |
| { |
| "epoch": 5.541549953314659, |
| "grad_norm": 7.550325393676758, |
| "learning_rate": 2.4645568258639433e-05, |
| "loss": 0.2835, |
| "num_input_tokens_seen": 766728, |
| "step": 5935 |
| }, |
| { |
| "epoch": 5.546218487394958, |
| "grad_norm": 7.056575298309326, |
| "learning_rate": 2.4604832196365435e-05, |
| "loss": 0.2372, |
| "num_input_tokens_seen": 767336, |
| "step": 5940 |
| }, |
| { |
| "epoch": 5.550887021475257, |
| "grad_norm": 3.2988297939300537, |
| "learning_rate": 2.4564097183530572e-05, |
| "loss": 0.2944, |
| "num_input_tokens_seen": 768008, |
| "step": 5945 |
| }, |
| { |
| "epoch": 5.555555555555555, |
| "grad_norm": 7.315042018890381, |
| "learning_rate": 2.4523363328313974e-05, |
| "loss": 0.1725, |
| "num_input_tokens_seen": 768664, |
| "step": 5950 |
| }, |
| { |
| "epoch": 5.560224089635854, |
| "grad_norm": 21.145261764526367, |
| "learning_rate": 2.4482630738891713e-05, |
| "loss": 0.2216, |
| "num_input_tokens_seen": 769208, |
| "step": 5955 |
| }, |
| { |
| "epoch": 5.564892623716153, |
| "grad_norm": 2.414390802383423, |
| "learning_rate": 2.444189952343651e-05, |
| "loss": 0.1617, |
| "num_input_tokens_seen": 769864, |
| "step": 5960 |
| }, |
| { |
| "epoch": 5.569561157796452, |
| "grad_norm": 6.385523319244385, |
| "learning_rate": 2.4401169790117427e-05, |
| "loss": 0.1384, |
| "num_input_tokens_seen": 770472, |
| "step": 5965 |
| }, |
| { |
| "epoch": 5.57422969187675, |
| "grad_norm": 4.405248165130615, |
| "learning_rate": 2.4360441647099592e-05, |
| "loss": 0.1617, |
| "num_input_tokens_seen": 771128, |
| "step": 5970 |
| }, |
| { |
| "epoch": 5.578898225957049, |
| "grad_norm": 10.55893611907959, |
| "learning_rate": 2.4319715202543905e-05, |
| "loss": 0.2984, |
| "num_input_tokens_seen": 771784, |
| "step": 5975 |
| }, |
| { |
| "epoch": 5.583566760037348, |
| "grad_norm": 8.2843017578125, |
| "learning_rate": 2.4278990564606753e-05, |
| "loss": 0.1171, |
| "num_input_tokens_seen": 772472, |
| "step": 5980 |
| }, |
| { |
| "epoch": 5.588235294117647, |
| "grad_norm": 0.031208815053105354, |
| "learning_rate": 2.423826784143974e-05, |
| "loss": 0.1669, |
| "num_input_tokens_seen": 773192, |
| "step": 5985 |
| }, |
| { |
| "epoch": 5.5929038281979455, |
| "grad_norm": 14.423694610595703, |
| "learning_rate": 2.419754714118938e-05, |
| "loss": 0.2992, |
| "num_input_tokens_seen": 773800, |
| "step": 5990 |
| }, |
| { |
| "epoch": 5.597572362278244, |
| "grad_norm": 0.023451492190361023, |
| "learning_rate": 2.4156828571996808e-05, |
| "loss": 0.2251, |
| "num_input_tokens_seen": 774472, |
| "step": 5995 |
| }, |
| { |
| "epoch": 5.602240896358543, |
| "grad_norm": 3.960092306137085, |
| "learning_rate": 2.4116112241997486e-05, |
| "loss": 0.1467, |
| "num_input_tokens_seen": 775080, |
| "step": 6000 |
| }, |
| { |
| "epoch": 5.606909430438842, |
| "grad_norm": 4.254623889923096, |
| "learning_rate": 2.4075398259320964e-05, |
| "loss": 0.2805, |
| "num_input_tokens_seen": 775704, |
| "step": 6005 |
| }, |
| { |
| "epoch": 5.611577964519141, |
| "grad_norm": 2.3658244609832764, |
| "learning_rate": 2.403468673209054e-05, |
| "loss": 0.1634, |
| "num_input_tokens_seen": 776328, |
| "step": 6010 |
| }, |
| { |
| "epoch": 5.616246498599439, |
| "grad_norm": 5.901800632476807, |
| "learning_rate": 2.399397776842298e-05, |
| "loss": 0.4695, |
| "num_input_tokens_seen": 776936, |
| "step": 6015 |
| }, |
| { |
| "epoch": 5.620915032679738, |
| "grad_norm": 4.022805690765381, |
| "learning_rate": 2.3953271476428268e-05, |
| "loss": 0.1232, |
| "num_input_tokens_seen": 777576, |
| "step": 6020 |
| }, |
| { |
| "epoch": 5.625583566760037, |
| "grad_norm": 14.644888877868652, |
| "learning_rate": 2.3912567964209264e-05, |
| "loss": 0.3136, |
| "num_input_tokens_seen": 778168, |
| "step": 6025 |
| }, |
| { |
| "epoch": 5.630252100840336, |
| "grad_norm": 6.010149002075195, |
| "learning_rate": 2.387186733986147e-05, |
| "loss": 0.2141, |
| "num_input_tokens_seen": 778792, |
| "step": 6030 |
| }, |
| { |
| "epoch": 5.634920634920634, |
| "grad_norm": 4.33161735534668, |
| "learning_rate": 2.38311697114727e-05, |
| "loss": 0.2491, |
| "num_input_tokens_seen": 779448, |
| "step": 6035 |
| }, |
| { |
| "epoch": 5.639589169000933, |
| "grad_norm": 6.012837886810303, |
| "learning_rate": 2.3790475187122836e-05, |
| "loss": 0.2456, |
| "num_input_tokens_seen": 780168, |
| "step": 6040 |
| }, |
| { |
| "epoch": 5.644257703081233, |
| "grad_norm": 0.5816524624824524, |
| "learning_rate": 2.374978387488348e-05, |
| "loss": 0.2491, |
| "num_input_tokens_seen": 780728, |
| "step": 6045 |
| }, |
| { |
| "epoch": 5.648926237161532, |
| "grad_norm": 14.974713325500488, |
| "learning_rate": 2.3709095882817737e-05, |
| "loss": 0.2709, |
| "num_input_tokens_seen": 781368, |
| "step": 6050 |
| }, |
| { |
| "epoch": 5.65359477124183, |
| "grad_norm": 8.99953842163086, |
| "learning_rate": 2.3668411318979884e-05, |
| "loss": 0.3081, |
| "num_input_tokens_seen": 782072, |
| "step": 6055 |
| }, |
| { |
| "epoch": 5.658263305322129, |
| "grad_norm": 1.4049012660980225, |
| "learning_rate": 2.362773029141508e-05, |
| "loss": 0.1005, |
| "num_input_tokens_seen": 782744, |
| "step": 6060 |
| }, |
| { |
| "epoch": 5.662931839402428, |
| "grad_norm": 2.7593705654144287, |
| "learning_rate": 2.358705290815913e-05, |
| "loss": 0.1774, |
| "num_input_tokens_seen": 783384, |
| "step": 6065 |
| }, |
| { |
| "epoch": 5.667600373482727, |
| "grad_norm": 8.04900074005127, |
| "learning_rate": 2.3546379277238107e-05, |
| "loss": 0.1899, |
| "num_input_tokens_seen": 784008, |
| "step": 6070 |
| }, |
| { |
| "epoch": 5.6722689075630255, |
| "grad_norm": 5.208436012268066, |
| "learning_rate": 2.3505709506668165e-05, |
| "loss": 0.1833, |
| "num_input_tokens_seen": 784648, |
| "step": 6075 |
| }, |
| { |
| "epoch": 5.676937441643324, |
| "grad_norm": 3.1610107421875, |
| "learning_rate": 2.3465043704455182e-05, |
| "loss": 0.1554, |
| "num_input_tokens_seen": 785400, |
| "step": 6080 |
| }, |
| { |
| "epoch": 5.681605975723623, |
| "grad_norm": 6.8983988761901855, |
| "learning_rate": 2.3424381978594505e-05, |
| "loss": 0.0653, |
| "num_input_tokens_seen": 786056, |
| "step": 6085 |
| }, |
| { |
| "epoch": 5.686274509803922, |
| "grad_norm": 14.282424926757812, |
| "learning_rate": 2.3383724437070668e-05, |
| "loss": 0.1803, |
| "num_input_tokens_seen": 786648, |
| "step": 6090 |
| }, |
| { |
| "epoch": 5.690943043884221, |
| "grad_norm": 12.445168495178223, |
| "learning_rate": 2.3343071187857062e-05, |
| "loss": 0.1965, |
| "num_input_tokens_seen": 787272, |
| "step": 6095 |
| }, |
| { |
| "epoch": 5.695611577964519, |
| "grad_norm": 2.3133530616760254, |
| "learning_rate": 2.3302422338915696e-05, |
| "loss": 0.2115, |
| "num_input_tokens_seen": 787896, |
| "step": 6100 |
| }, |
| { |
| "epoch": 5.700280112044818, |
| "grad_norm": 2.805271863937378, |
| "learning_rate": 2.3261777998196905e-05, |
| "loss": 0.1056, |
| "num_input_tokens_seen": 788504, |
| "step": 6105 |
| }, |
| { |
| "epoch": 5.704948646125117, |
| "grad_norm": 4.6447248458862305, |
| "learning_rate": 2.322113827363904e-05, |
| "loss": 0.1847, |
| "num_input_tokens_seen": 789112, |
| "step": 6110 |
| }, |
| { |
| "epoch": 5.709617180205416, |
| "grad_norm": 8.13668155670166, |
| "learning_rate": 2.3180503273168194e-05, |
| "loss": 0.1741, |
| "num_input_tokens_seen": 789832, |
| "step": 6115 |
| }, |
| { |
| "epoch": 5.714285714285714, |
| "grad_norm": 2.305729389190674, |
| "learning_rate": 2.31398731046979e-05, |
| "loss": 0.3343, |
| "num_input_tokens_seen": 790424, |
| "step": 6120 |
| }, |
| { |
| "epoch": 5.718954248366013, |
| "grad_norm": 2.2352488040924072, |
| "learning_rate": 2.3099247876128877e-05, |
| "loss": 0.154, |
| "num_input_tokens_seen": 791128, |
| "step": 6125 |
| }, |
| { |
| "epoch": 5.723622782446312, |
| "grad_norm": 10.044025421142578, |
| "learning_rate": 2.3058627695348737e-05, |
| "loss": 0.2203, |
| "num_input_tokens_seen": 791896, |
| "step": 6130 |
| }, |
| { |
| "epoch": 5.728291316526611, |
| "grad_norm": 9.180370330810547, |
| "learning_rate": 2.3018012670231647e-05, |
| "loss": 0.131, |
| "num_input_tokens_seen": 792520, |
| "step": 6135 |
| }, |
| { |
| "epoch": 5.7329598506069095, |
| "grad_norm": 5.549483776092529, |
| "learning_rate": 2.2977402908638135e-05, |
| "loss": 0.1664, |
| "num_input_tokens_seen": 793192, |
| "step": 6140 |
| }, |
| { |
| "epoch": 5.737628384687208, |
| "grad_norm": 10.082823753356934, |
| "learning_rate": 2.2936798518414686e-05, |
| "loss": 0.2269, |
| "num_input_tokens_seen": 793816, |
| "step": 6145 |
| }, |
| { |
| "epoch": 5.742296918767507, |
| "grad_norm": 4.678831577301025, |
| "learning_rate": 2.2896199607393577e-05, |
| "loss": 0.1145, |
| "num_input_tokens_seen": 794520, |
| "step": 6150 |
| }, |
| { |
| "epoch": 5.746965452847806, |
| "grad_norm": 7.657114505767822, |
| "learning_rate": 2.2855606283392516e-05, |
| "loss": 0.1094, |
| "num_input_tokens_seen": 795176, |
| "step": 6155 |
| }, |
| { |
| "epoch": 5.751633986928105, |
| "grad_norm": 20.42408561706543, |
| "learning_rate": 2.281501865421436e-05, |
| "loss": 0.2962, |
| "num_input_tokens_seen": 795768, |
| "step": 6160 |
| }, |
| { |
| "epoch": 5.756302521008403, |
| "grad_norm": 5.679710388183594, |
| "learning_rate": 2.2774436827646865e-05, |
| "loss": 0.392, |
| "num_input_tokens_seen": 796408, |
| "step": 6165 |
| }, |
| { |
| "epoch": 5.760971055088702, |
| "grad_norm": 6.150409698486328, |
| "learning_rate": 2.2733860911462342e-05, |
| "loss": 0.2671, |
| "num_input_tokens_seen": 797048, |
| "step": 6170 |
| }, |
| { |
| "epoch": 5.765639589169001, |
| "grad_norm": 11.56528377532959, |
| "learning_rate": 2.2693291013417453e-05, |
| "loss": 0.2104, |
| "num_input_tokens_seen": 797672, |
| "step": 6175 |
| }, |
| { |
| "epoch": 5.7703081232493, |
| "grad_norm": 4.323744297027588, |
| "learning_rate": 2.265272724125284e-05, |
| "loss": 0.2423, |
| "num_input_tokens_seen": 798312, |
| "step": 6180 |
| }, |
| { |
| "epoch": 5.774976657329598, |
| "grad_norm": 6.345007419586182, |
| "learning_rate": 2.2612169702692887e-05, |
| "loss": 0.6142, |
| "num_input_tokens_seen": 798936, |
| "step": 6185 |
| }, |
| { |
| "epoch": 5.779645191409897, |
| "grad_norm": 7.959813117980957, |
| "learning_rate": 2.257161850544545e-05, |
| "loss": 0.3423, |
| "num_input_tokens_seen": 799496, |
| "step": 6190 |
| }, |
| { |
| "epoch": 5.784313725490196, |
| "grad_norm": 4.817144393920898, |
| "learning_rate": 2.25310737572015e-05, |
| "loss": 0.4931, |
| "num_input_tokens_seen": 800088, |
| "step": 6195 |
| }, |
| { |
| "epoch": 5.788982259570495, |
| "grad_norm": 3.499696969985962, |
| "learning_rate": 2.2490535565634897e-05, |
| "loss": 0.2209, |
| "num_input_tokens_seen": 800728, |
| "step": 6200 |
| }, |
| { |
| "epoch": 5.7936507936507935, |
| "grad_norm": 5.070218086242676, |
| "learning_rate": 2.2450004038402107e-05, |
| "loss": 0.1922, |
| "num_input_tokens_seen": 801272, |
| "step": 6205 |
| }, |
| { |
| "epoch": 5.798319327731092, |
| "grad_norm": 2.386444330215454, |
| "learning_rate": 2.2409479283141886e-05, |
| "loss": 0.1461, |
| "num_input_tokens_seen": 801992, |
| "step": 6210 |
| }, |
| { |
| "epoch": 5.802987861811391, |
| "grad_norm": 22.35248374938965, |
| "learning_rate": 2.236896140747501e-05, |
| "loss": 0.2301, |
| "num_input_tokens_seen": 802664, |
| "step": 6215 |
| }, |
| { |
| "epoch": 5.80765639589169, |
| "grad_norm": 12.163101196289062, |
| "learning_rate": 2.2328450519003963e-05, |
| "loss": 0.1354, |
| "num_input_tokens_seen": 803288, |
| "step": 6220 |
| }, |
| { |
| "epoch": 5.812324929971989, |
| "grad_norm": 9.302704811096191, |
| "learning_rate": 2.2287946725312693e-05, |
| "loss": 0.2459, |
| "num_input_tokens_seen": 803928, |
| "step": 6225 |
| }, |
| { |
| "epoch": 5.816993464052287, |
| "grad_norm": 1.620877742767334, |
| "learning_rate": 2.2247450133966317e-05, |
| "loss": 0.264, |
| "num_input_tokens_seen": 804568, |
| "step": 6230 |
| }, |
| { |
| "epoch": 5.821661998132586, |
| "grad_norm": 2.096247911453247, |
| "learning_rate": 2.2206960852510804e-05, |
| "loss": 0.1366, |
| "num_input_tokens_seen": 805272, |
| "step": 6235 |
| }, |
| { |
| "epoch": 5.826330532212885, |
| "grad_norm": 5.8347578048706055, |
| "learning_rate": 2.2166478988472716e-05, |
| "loss": 0.3308, |
| "num_input_tokens_seen": 805832, |
| "step": 6240 |
| }, |
| { |
| "epoch": 5.830999066293184, |
| "grad_norm": 5.4447550773620605, |
| "learning_rate": 2.2126004649358916e-05, |
| "loss": 0.1452, |
| "num_input_tokens_seen": 806440, |
| "step": 6245 |
| }, |
| { |
| "epoch": 5.835667600373482, |
| "grad_norm": 5.139708042144775, |
| "learning_rate": 2.2085537942656287e-05, |
| "loss": 0.1665, |
| "num_input_tokens_seen": 807112, |
| "step": 6250 |
| }, |
| { |
| "epoch": 5.840336134453781, |
| "grad_norm": 3.3073081970214844, |
| "learning_rate": 2.2045078975831452e-05, |
| "loss": 0.2967, |
| "num_input_tokens_seen": 807672, |
| "step": 6255 |
| }, |
| { |
| "epoch": 5.84500466853408, |
| "grad_norm": 7.739352226257324, |
| "learning_rate": 2.2004627856330462e-05, |
| "loss": 0.1503, |
| "num_input_tokens_seen": 808344, |
| "step": 6260 |
| }, |
| { |
| "epoch": 5.849673202614379, |
| "grad_norm": 2.465211868286133, |
| "learning_rate": 2.196418469157852e-05, |
| "loss": 0.1154, |
| "num_input_tokens_seen": 809144, |
| "step": 6265 |
| }, |
| { |
| "epoch": 5.8543417366946775, |
| "grad_norm": 3.8778762817382812, |
| "learning_rate": 2.1923749588979737e-05, |
| "loss": 0.2426, |
| "num_input_tokens_seen": 809816, |
| "step": 6270 |
| }, |
| { |
| "epoch": 5.859010270774976, |
| "grad_norm": 13.767936706542969, |
| "learning_rate": 2.1883322655916793e-05, |
| "loss": 0.2215, |
| "num_input_tokens_seen": 810520, |
| "step": 6275 |
| }, |
| { |
| "epoch": 5.863678804855276, |
| "grad_norm": 25.827281951904297, |
| "learning_rate": 2.1842903999750665e-05, |
| "loss": 0.4355, |
| "num_input_tokens_seen": 811080, |
| "step": 6280 |
| }, |
| { |
| "epoch": 5.868347338935575, |
| "grad_norm": 2.462376117706299, |
| "learning_rate": 2.180249372782038e-05, |
| "loss": 0.2118, |
| "num_input_tokens_seen": 811704, |
| "step": 6285 |
| }, |
| { |
| "epoch": 5.8730158730158735, |
| "grad_norm": 0.8857133388519287, |
| "learning_rate": 2.1762091947442643e-05, |
| "loss": 0.2445, |
| "num_input_tokens_seen": 812312, |
| "step": 6290 |
| }, |
| { |
| "epoch": 5.877684407096172, |
| "grad_norm": 3.9621355533599854, |
| "learning_rate": 2.1721698765911674e-05, |
| "loss": 0.1307, |
| "num_input_tokens_seen": 812984, |
| "step": 6295 |
| }, |
| { |
| "epoch": 5.882352941176471, |
| "grad_norm": 3.7414557933807373, |
| "learning_rate": 2.1681314290498806e-05, |
| "loss": 0.1606, |
| "num_input_tokens_seen": 813768, |
| "step": 6300 |
| }, |
| { |
| "epoch": 5.88702147525677, |
| "grad_norm": 2.700284719467163, |
| "learning_rate": 2.164093862845228e-05, |
| "loss": 0.3214, |
| "num_input_tokens_seen": 814424, |
| "step": 6305 |
| }, |
| { |
| "epoch": 5.8916900093370685, |
| "grad_norm": 4.355251789093018, |
| "learning_rate": 2.1600571886996933e-05, |
| "loss": 0.2542, |
| "num_input_tokens_seen": 815000, |
| "step": 6310 |
| }, |
| { |
| "epoch": 5.896358543417367, |
| "grad_norm": 0.26216188073158264, |
| "learning_rate": 2.156021417333388e-05, |
| "loss": 0.1463, |
| "num_input_tokens_seen": 815624, |
| "step": 6315 |
| }, |
| { |
| "epoch": 5.901027077497666, |
| "grad_norm": 5.650294780731201, |
| "learning_rate": 2.1519865594640302e-05, |
| "loss": 0.1264, |
| "num_input_tokens_seen": 816248, |
| "step": 6320 |
| }, |
| { |
| "epoch": 5.905695611577965, |
| "grad_norm": 4.275903224945068, |
| "learning_rate": 2.1479526258069087e-05, |
| "loss": 0.0928, |
| "num_input_tokens_seen": 816808, |
| "step": 6325 |
| }, |
| { |
| "epoch": 5.910364145658264, |
| "grad_norm": 6.359528064727783, |
| "learning_rate": 2.1439196270748598e-05, |
| "loss": 0.2651, |
| "num_input_tokens_seen": 817416, |
| "step": 6330 |
| }, |
| { |
| "epoch": 5.915032679738562, |
| "grad_norm": 3.161203145980835, |
| "learning_rate": 2.139887573978238e-05, |
| "loss": 0.0926, |
| "num_input_tokens_seen": 818104, |
| "step": 6335 |
| }, |
| { |
| "epoch": 5.919701213818861, |
| "grad_norm": 7.976051330566406, |
| "learning_rate": 2.1358564772248826e-05, |
| "loss": 0.2749, |
| "num_input_tokens_seen": 818744, |
| "step": 6340 |
| }, |
| { |
| "epoch": 5.92436974789916, |
| "grad_norm": 9.922679901123047, |
| "learning_rate": 2.131826347520096e-05, |
| "loss": 0.3568, |
| "num_input_tokens_seen": 819368, |
| "step": 6345 |
| }, |
| { |
| "epoch": 5.929038281979459, |
| "grad_norm": 2.597715377807617, |
| "learning_rate": 2.1277971955666125e-05, |
| "loss": 0.2073, |
| "num_input_tokens_seen": 820040, |
| "step": 6350 |
| }, |
| { |
| "epoch": 5.9337068160597575, |
| "grad_norm": 16.03790855407715, |
| "learning_rate": 2.1237690320645695e-05, |
| "loss": 0.2687, |
| "num_input_tokens_seen": 820664, |
| "step": 6355 |
| }, |
| { |
| "epoch": 5.938375350140056, |
| "grad_norm": 2.43839168548584, |
| "learning_rate": 2.1197418677114795e-05, |
| "loss": 0.257, |
| "num_input_tokens_seen": 821304, |
| "step": 6360 |
| }, |
| { |
| "epoch": 5.943043884220355, |
| "grad_norm": 5.4671220779418945, |
| "learning_rate": 2.1157157132021994e-05, |
| "loss": 0.1754, |
| "num_input_tokens_seen": 821928, |
| "step": 6365 |
| }, |
| { |
| "epoch": 5.947712418300654, |
| "grad_norm": 3.340067148208618, |
| "learning_rate": 2.1116905792289067e-05, |
| "loss": 0.3771, |
| "num_input_tokens_seen": 822536, |
| "step": 6370 |
| }, |
| { |
| "epoch": 5.9523809523809526, |
| "grad_norm": 8.478741645812988, |
| "learning_rate": 2.1076664764810693e-05, |
| "loss": 0.2832, |
| "num_input_tokens_seen": 823176, |
| "step": 6375 |
| }, |
| { |
| "epoch": 5.957049486461251, |
| "grad_norm": 8.456277847290039, |
| "learning_rate": 2.103643415645414e-05, |
| "loss": 0.2786, |
| "num_input_tokens_seen": 823864, |
| "step": 6380 |
| }, |
| { |
| "epoch": 5.96171802054155, |
| "grad_norm": 2.0959372520446777, |
| "learning_rate": 2.0996214074059034e-05, |
| "loss": 0.2735, |
| "num_input_tokens_seen": 824600, |
| "step": 6385 |
| }, |
| { |
| "epoch": 5.966386554621849, |
| "grad_norm": 1.7711430788040161, |
| "learning_rate": 2.0956004624437014e-05, |
| "loss": 0.1891, |
| "num_input_tokens_seen": 825192, |
| "step": 6390 |
| }, |
| { |
| "epoch": 5.971055088702148, |
| "grad_norm": 8.200021743774414, |
| "learning_rate": 2.091580591437151e-05, |
| "loss": 0.1611, |
| "num_input_tokens_seen": 825800, |
| "step": 6395 |
| }, |
| { |
| "epoch": 5.975723622782446, |
| "grad_norm": 1.9257704019546509, |
| "learning_rate": 2.087561805061741e-05, |
| "loss": 0.3492, |
| "num_input_tokens_seen": 826536, |
| "step": 6400 |
| }, |
| { |
| "epoch": 5.980392156862745, |
| "grad_norm": 11.712030410766602, |
| "learning_rate": 2.0835441139900836e-05, |
| "loss": 0.2512, |
| "num_input_tokens_seen": 827176, |
| "step": 6405 |
| }, |
| { |
| "epoch": 5.985060690943044, |
| "grad_norm": 8.402978897094727, |
| "learning_rate": 2.0795275288918763e-05, |
| "loss": 0.4235, |
| "num_input_tokens_seen": 827832, |
| "step": 6410 |
| }, |
| { |
| "epoch": 5.989729225023343, |
| "grad_norm": 5.503647804260254, |
| "learning_rate": 2.075512060433884e-05, |
| "loss": 0.2737, |
| "num_input_tokens_seen": 828504, |
| "step": 6415 |
| }, |
| { |
| "epoch": 5.9943977591036415, |
| "grad_norm": 3.2183749675750732, |
| "learning_rate": 2.0714977192799055e-05, |
| "loss": 0.1136, |
| "num_input_tokens_seen": 829176, |
| "step": 6420 |
| }, |
| { |
| "epoch": 5.99906629318394, |
| "grad_norm": 4.965052604675293, |
| "learning_rate": 2.067484516090744e-05, |
| "loss": 0.2705, |
| "num_input_tokens_seen": 829800, |
| "step": 6425 |
| }, |
| { |
| "epoch": 6.003734827264239, |
| "grad_norm": 2.7685673236846924, |
| "learning_rate": 2.063472461524184e-05, |
| "loss": 0.1354, |
| "num_input_tokens_seen": 830456, |
| "step": 6430 |
| }, |
| { |
| "epoch": 6.0056022408963585, |
| "eval_loss": 0.8559670448303223, |
| "eval_runtime": 3.8802, |
| "eval_samples_per_second": 61.337, |
| "eval_steps_per_second": 30.669, |
| "num_input_tokens_seen": 830744, |
| "step": 6432 |
| }, |
| { |
| "epoch": 6.008403361344538, |
| "grad_norm": 1.8242768049240112, |
| "learning_rate": 2.059461566234954e-05, |
| "loss": 0.1321, |
| "num_input_tokens_seen": 831144, |
| "step": 6435 |
| }, |
| { |
| "epoch": 6.0130718954248366, |
| "grad_norm": 1.6450865268707275, |
| "learning_rate": 2.0554518408747103e-05, |
| "loss": 0.0649, |
| "num_input_tokens_seen": 831768, |
| "step": 6440 |
| }, |
| { |
| "epoch": 6.017740429505135, |
| "grad_norm": 41.15055465698242, |
| "learning_rate": 2.051443296091998e-05, |
| "loss": 0.1878, |
| "num_input_tokens_seen": 832408, |
| "step": 6445 |
| }, |
| { |
| "epoch": 6.022408963585434, |
| "grad_norm": 1.7977114915847778, |
| "learning_rate": 2.0474359425322276e-05, |
| "loss": 0.2574, |
| "num_input_tokens_seen": 833096, |
| "step": 6450 |
| }, |
| { |
| "epoch": 6.027077497665733, |
| "grad_norm": 0.9406064748764038, |
| "learning_rate": 2.0434297908376486e-05, |
| "loss": 0.1256, |
| "num_input_tokens_seen": 833736, |
| "step": 6455 |
| }, |
| { |
| "epoch": 6.031746031746032, |
| "grad_norm": 1.812961220741272, |
| "learning_rate": 2.0394248516473156e-05, |
| "loss": 0.0935, |
| "num_input_tokens_seen": 834600, |
| "step": 6460 |
| }, |
| { |
| "epoch": 6.03641456582633, |
| "grad_norm": 8.429821968078613, |
| "learning_rate": 2.0354211355970633e-05, |
| "loss": 0.1205, |
| "num_input_tokens_seen": 835176, |
| "step": 6465 |
| }, |
| { |
| "epoch": 6.041083099906629, |
| "grad_norm": 7.432557106018066, |
| "learning_rate": 2.0314186533194807e-05, |
| "loss": 0.2049, |
| "num_input_tokens_seen": 835864, |
| "step": 6470 |
| }, |
| { |
| "epoch": 6.045751633986928, |
| "grad_norm": 8.452810287475586, |
| "learning_rate": 2.0274174154438787e-05, |
| "loss": 0.1331, |
| "num_input_tokens_seen": 836536, |
| "step": 6475 |
| }, |
| { |
| "epoch": 6.050420168067227, |
| "grad_norm": 1.3018027544021606, |
| "learning_rate": 2.0234174325962638e-05, |
| "loss": 0.0948, |
| "num_input_tokens_seen": 837320, |
| "step": 6480 |
| }, |
| { |
| "epoch": 6.0550887021475255, |
| "grad_norm": 5.2079057693481445, |
| "learning_rate": 2.0194187153993085e-05, |
| "loss": 0.0504, |
| "num_input_tokens_seen": 837912, |
| "step": 6485 |
| }, |
| { |
| "epoch": 6.059757236227824, |
| "grad_norm": 2.5622472763061523, |
| "learning_rate": 2.015421274472325e-05, |
| "loss": 0.0902, |
| "num_input_tokens_seen": 838584, |
| "step": 6490 |
| }, |
| { |
| "epoch": 6.064425770308123, |
| "grad_norm": 3.108729600906372, |
| "learning_rate": 2.0114251204312367e-05, |
| "loss": 0.1672, |
| "num_input_tokens_seen": 839192, |
| "step": 6495 |
| }, |
| { |
| "epoch": 6.069094304388422, |
| "grad_norm": 0.7695748805999756, |
| "learning_rate": 2.007430263888549e-05, |
| "loss": 0.063, |
| "num_input_tokens_seen": 839848, |
| "step": 6500 |
| }, |
| { |
| "epoch": 6.073762838468721, |
| "grad_norm": 3.6968908309936523, |
| "learning_rate": 2.003436715453321e-05, |
| "loss": 0.1887, |
| "num_input_tokens_seen": 840520, |
| "step": 6505 |
| }, |
| { |
| "epoch": 6.078431372549019, |
| "grad_norm": 4.965339183807373, |
| "learning_rate": 1.999444485731138e-05, |
| "loss": 0.3157, |
| "num_input_tokens_seen": 841064, |
| "step": 6510 |
| }, |
| { |
| "epoch": 6.083099906629318, |
| "grad_norm": 4.380035400390625, |
| "learning_rate": 1.9954535853240837e-05, |
| "loss": 0.1266, |
| "num_input_tokens_seen": 841688, |
| "step": 6515 |
| }, |
| { |
| "epoch": 6.087768440709617, |
| "grad_norm": 9.54501724243164, |
| "learning_rate": 1.991464024830712e-05, |
| "loss": 0.3874, |
| "num_input_tokens_seen": 842312, |
| "step": 6520 |
| }, |
| { |
| "epoch": 6.092436974789916, |
| "grad_norm": 2.171372652053833, |
| "learning_rate": 1.987475814846017e-05, |
| "loss": 0.1375, |
| "num_input_tokens_seen": 842888, |
| "step": 6525 |
| }, |
| { |
| "epoch": 6.097105508870214, |
| "grad_norm": 1.786184310913086, |
| "learning_rate": 1.9834889659614082e-05, |
| "loss": 0.0824, |
| "num_input_tokens_seen": 843416, |
| "step": 6530 |
| }, |
| { |
| "epoch": 6.101774042950513, |
| "grad_norm": 0.32185816764831543, |
| "learning_rate": 1.979503488764678e-05, |
| "loss": 0.2355, |
| "num_input_tokens_seen": 844008, |
| "step": 6535 |
| }, |
| { |
| "epoch": 6.106442577030812, |
| "grad_norm": 0.2377963364124298, |
| "learning_rate": 1.975519393839978e-05, |
| "loss": 0.1485, |
| "num_input_tokens_seen": 844600, |
| "step": 6540 |
| }, |
| { |
| "epoch": 6.111111111111111, |
| "grad_norm": 1.2736645936965942, |
| "learning_rate": 1.9715366917677878e-05, |
| "loss": 0.1354, |
| "num_input_tokens_seen": 845176, |
| "step": 6545 |
| }, |
| { |
| "epoch": 6.1157796451914095, |
| "grad_norm": 4.923788070678711, |
| "learning_rate": 1.967555393124889e-05, |
| "loss": 0.2031, |
| "num_input_tokens_seen": 845784, |
| "step": 6550 |
| }, |
| { |
| "epoch": 6.120448179271708, |
| "grad_norm": 10.04391860961914, |
| "learning_rate": 1.9635755084843366e-05, |
| "loss": 0.11, |
| "num_input_tokens_seen": 846424, |
| "step": 6555 |
| }, |
| { |
| "epoch": 6.125116713352007, |
| "grad_norm": 1.872023105621338, |
| "learning_rate": 1.959597048415428e-05, |
| "loss": 0.2214, |
| "num_input_tokens_seen": 847080, |
| "step": 6560 |
| }, |
| { |
| "epoch": 6.129785247432307, |
| "grad_norm": 3.639702081680298, |
| "learning_rate": 1.9556200234836792e-05, |
| "loss": 0.0403, |
| "num_input_tokens_seen": 847864, |
| "step": 6565 |
| }, |
| { |
| "epoch": 6.1344537815126055, |
| "grad_norm": 5.251626491546631, |
| "learning_rate": 1.9516444442507947e-05, |
| "loss": 0.1082, |
| "num_input_tokens_seen": 848488, |
| "step": 6570 |
| }, |
| { |
| "epoch": 6.139122315592904, |
| "grad_norm": 3.5228431224823, |
| "learning_rate": 1.9476703212746413e-05, |
| "loss": 0.058, |
| "num_input_tokens_seen": 849144, |
| "step": 6575 |
| }, |
| { |
| "epoch": 6.143790849673203, |
| "grad_norm": 4.5999650955200195, |
| "learning_rate": 1.9436976651092144e-05, |
| "loss": 0.1331, |
| "num_input_tokens_seen": 849768, |
| "step": 6580 |
| }, |
| { |
| "epoch": 6.148459383753502, |
| "grad_norm": 5.804399013519287, |
| "learning_rate": 1.9397264863046176e-05, |
| "loss": 0.3311, |
| "num_input_tokens_seen": 850472, |
| "step": 6585 |
| }, |
| { |
| "epoch": 6.1531279178338005, |
| "grad_norm": 2.8000004291534424, |
| "learning_rate": 1.9357567954070294e-05, |
| "loss": 0.1268, |
| "num_input_tokens_seen": 851176, |
| "step": 6590 |
| }, |
| { |
| "epoch": 6.157796451914099, |
| "grad_norm": 4.5097503662109375, |
| "learning_rate": 1.9317886029586778e-05, |
| "loss": 0.092, |
| "num_input_tokens_seen": 851944, |
| "step": 6595 |
| }, |
| { |
| "epoch": 6.162464985994398, |
| "grad_norm": 1.2983640432357788, |
| "learning_rate": 1.927821919497812e-05, |
| "loss": 0.0387, |
| "num_input_tokens_seen": 852552, |
| "step": 6600 |
| }, |
| { |
| "epoch": 6.167133520074697, |
| "grad_norm": 7.8906354904174805, |
| "learning_rate": 1.9238567555586714e-05, |
| "loss": 0.172, |
| "num_input_tokens_seen": 853224, |
| "step": 6605 |
| }, |
| { |
| "epoch": 6.171802054154996, |
| "grad_norm": 8.38399887084961, |
| "learning_rate": 1.9198931216714614e-05, |
| "loss": 0.2156, |
| "num_input_tokens_seen": 853736, |
| "step": 6610 |
| }, |
| { |
| "epoch": 6.176470588235294, |
| "grad_norm": 0.570091724395752, |
| "learning_rate": 1.9159310283623245e-05, |
| "loss": 0.0985, |
| "num_input_tokens_seen": 854360, |
| "step": 6615 |
| }, |
| { |
| "epoch": 6.181139122315593, |
| "grad_norm": 0.9433493614196777, |
| "learning_rate": 1.911970486153312e-05, |
| "loss": 0.2101, |
| "num_input_tokens_seen": 855048, |
| "step": 6620 |
| }, |
| { |
| "epoch": 6.185807656395892, |
| "grad_norm": 12.825504302978516, |
| "learning_rate": 1.908011505562356e-05, |
| "loss": 0.1157, |
| "num_input_tokens_seen": 855720, |
| "step": 6625 |
| }, |
| { |
| "epoch": 6.190476190476191, |
| "grad_norm": 7.946544647216797, |
| "learning_rate": 1.9040540971032392e-05, |
| "loss": 0.1044, |
| "num_input_tokens_seen": 856264, |
| "step": 6630 |
| }, |
| { |
| "epoch": 6.1951447245564895, |
| "grad_norm": 6.417392253875732, |
| "learning_rate": 1.900098271285572e-05, |
| "loss": 0.2791, |
| "num_input_tokens_seen": 856856, |
| "step": 6635 |
| }, |
| { |
| "epoch": 6.199813258636788, |
| "grad_norm": 0.97379469871521, |
| "learning_rate": 1.896144038614761e-05, |
| "loss": 0.0588, |
| "num_input_tokens_seen": 857576, |
| "step": 6640 |
| }, |
| { |
| "epoch": 6.204481792717087, |
| "grad_norm": 0.8709614872932434, |
| "learning_rate": 1.8921914095919814e-05, |
| "loss": 0.098, |
| "num_input_tokens_seen": 858216, |
| "step": 6645 |
| }, |
| { |
| "epoch": 6.209150326797386, |
| "grad_norm": 17.944868087768555, |
| "learning_rate": 1.8882403947141507e-05, |
| "loss": 0.3388, |
| "num_input_tokens_seen": 858904, |
| "step": 6650 |
| }, |
| { |
| "epoch": 6.2138188608776845, |
| "grad_norm": 0.9572569727897644, |
| "learning_rate": 1.8842910044738975e-05, |
| "loss": 0.012, |
| "num_input_tokens_seen": 859656, |
| "step": 6655 |
| }, |
| { |
| "epoch": 6.218487394957983, |
| "grad_norm": 4.014954090118408, |
| "learning_rate": 1.8803432493595387e-05, |
| "loss": 0.1918, |
| "num_input_tokens_seen": 860264, |
| "step": 6660 |
| }, |
| { |
| "epoch": 6.223155929038282, |
| "grad_norm": 0.572869598865509, |
| "learning_rate": 1.876397139855047e-05, |
| "loss": 0.0631, |
| "num_input_tokens_seen": 860920, |
| "step": 6665 |
| }, |
| { |
| "epoch": 6.227824463118581, |
| "grad_norm": 3.1461665630340576, |
| "learning_rate": 1.8724526864400248e-05, |
| "loss": 0.0422, |
| "num_input_tokens_seen": 861528, |
| "step": 6670 |
| }, |
| { |
| "epoch": 6.23249299719888, |
| "grad_norm": 10.453712463378906, |
| "learning_rate": 1.8685098995896792e-05, |
| "loss": 0.1564, |
| "num_input_tokens_seen": 862104, |
| "step": 6675 |
| }, |
| { |
| "epoch": 6.237161531279178, |
| "grad_norm": 0.3828265070915222, |
| "learning_rate": 1.8645687897747864e-05, |
| "loss": 0.2306, |
| "num_input_tokens_seen": 862840, |
| "step": 6680 |
| }, |
| { |
| "epoch": 6.241830065359477, |
| "grad_norm": 8.274612426757812, |
| "learning_rate": 1.8606293674616737e-05, |
| "loss": 0.1782, |
| "num_input_tokens_seen": 863480, |
| "step": 6685 |
| }, |
| { |
| "epoch": 6.246498599439776, |
| "grad_norm": 3.889509677886963, |
| "learning_rate": 1.856691643112184e-05, |
| "loss": 0.1569, |
| "num_input_tokens_seen": 864152, |
| "step": 6690 |
| }, |
| { |
| "epoch": 6.251167133520075, |
| "grad_norm": 0.07747960835695267, |
| "learning_rate": 1.8527556271836524e-05, |
| "loss": 0.0345, |
| "num_input_tokens_seen": 864824, |
| "step": 6695 |
| }, |
| { |
| "epoch": 6.2558356676003735, |
| "grad_norm": 1.108803629875183, |
| "learning_rate": 1.848821330128878e-05, |
| "loss": 0.0478, |
| "num_input_tokens_seen": 865464, |
| "step": 6700 |
| }, |
| { |
| "epoch": 6.260504201680672, |
| "grad_norm": 1.7075055837631226, |
| "learning_rate": 1.844888762396092e-05, |
| "loss": 0.0651, |
| "num_input_tokens_seen": 866184, |
| "step": 6705 |
| }, |
| { |
| "epoch": 6.265172735760971, |
| "grad_norm": 3.643524169921875, |
| "learning_rate": 1.8409579344289342e-05, |
| "loss": 0.0464, |
| "num_input_tokens_seen": 866872, |
| "step": 6710 |
| }, |
| { |
| "epoch": 6.26984126984127, |
| "grad_norm": 13.78272533416748, |
| "learning_rate": 1.8370288566664262e-05, |
| "loss": 0.2371, |
| "num_input_tokens_seen": 867496, |
| "step": 6715 |
| }, |
| { |
| "epoch": 6.2745098039215685, |
| "grad_norm": 3.8864872455596924, |
| "learning_rate": 1.83310153954294e-05, |
| "loss": 0.4803, |
| "num_input_tokens_seen": 868072, |
| "step": 6720 |
| }, |
| { |
| "epoch": 6.279178338001867, |
| "grad_norm": 3.947277069091797, |
| "learning_rate": 1.829175993488172e-05, |
| "loss": 0.1295, |
| "num_input_tokens_seen": 868744, |
| "step": 6725 |
| }, |
| { |
| "epoch": 6.283846872082166, |
| "grad_norm": 11.612577438354492, |
| "learning_rate": 1.8252522289271142e-05, |
| "loss": 0.0762, |
| "num_input_tokens_seen": 869416, |
| "step": 6730 |
| }, |
| { |
| "epoch": 6.288515406162465, |
| "grad_norm": 4.163094520568848, |
| "learning_rate": 1.8213302562800294e-05, |
| "loss": 0.2386, |
| "num_input_tokens_seen": 870056, |
| "step": 6735 |
| }, |
| { |
| "epoch": 6.293183940242764, |
| "grad_norm": 8.031195640563965, |
| "learning_rate": 1.817410085962421e-05, |
| "loss": 0.206, |
| "num_input_tokens_seen": 870696, |
| "step": 6740 |
| }, |
| { |
| "epoch": 6.297852474323062, |
| "grad_norm": 5.002910614013672, |
| "learning_rate": 1.8134917283850053e-05, |
| "loss": 0.109, |
| "num_input_tokens_seen": 871272, |
| "step": 6745 |
| }, |
| { |
| "epoch": 6.302521008403361, |
| "grad_norm": 5.592998504638672, |
| "learning_rate": 1.8095751939536866e-05, |
| "loss": 0.1421, |
| "num_input_tokens_seen": 871896, |
| "step": 6750 |
| }, |
| { |
| "epoch": 6.30718954248366, |
| "grad_norm": 2.709061622619629, |
| "learning_rate": 1.8056604930695232e-05, |
| "loss": 0.0284, |
| "num_input_tokens_seen": 872616, |
| "step": 6755 |
| }, |
| { |
| "epoch": 6.311858076563959, |
| "grad_norm": 1.0777950286865234, |
| "learning_rate": 1.8017476361287087e-05, |
| "loss": 0.1222, |
| "num_input_tokens_seen": 873336, |
| "step": 6760 |
| }, |
| { |
| "epoch": 6.3165266106442575, |
| "grad_norm": 0.15798994898796082, |
| "learning_rate": 1.797836633522538e-05, |
| "loss": 0.1798, |
| "num_input_tokens_seen": 874008, |
| "step": 6765 |
| }, |
| { |
| "epoch": 6.321195144724556, |
| "grad_norm": 1.92898428440094, |
| "learning_rate": 1.7939274956373813e-05, |
| "loss": 0.1037, |
| "num_input_tokens_seen": 874696, |
| "step": 6770 |
| }, |
| { |
| "epoch": 6.325863678804855, |
| "grad_norm": 2.868745803833008, |
| "learning_rate": 1.7900202328546557e-05, |
| "loss": 0.0836, |
| "num_input_tokens_seen": 875336, |
| "step": 6775 |
| }, |
| { |
| "epoch": 6.330532212885154, |
| "grad_norm": 3.641486167907715, |
| "learning_rate": 1.7861148555508007e-05, |
| "loss": 0.2157, |
| "num_input_tokens_seen": 875960, |
| "step": 6780 |
| }, |
| { |
| "epoch": 6.3352007469654525, |
| "grad_norm": 1.8708469867706299, |
| "learning_rate": 1.7822113740972478e-05, |
| "loss": 0.0314, |
| "num_input_tokens_seen": 876712, |
| "step": 6785 |
| }, |
| { |
| "epoch": 6.339869281045751, |
| "grad_norm": 0.6900608539581299, |
| "learning_rate": 1.778309798860393e-05, |
| "loss": 0.09, |
| "num_input_tokens_seen": 877352, |
| "step": 6790 |
| }, |
| { |
| "epoch": 6.34453781512605, |
| "grad_norm": 10.317023277282715, |
| "learning_rate": 1.7744101402015716e-05, |
| "loss": 0.3264, |
| "num_input_tokens_seen": 877944, |
| "step": 6795 |
| }, |
| { |
| "epoch": 6.349206349206349, |
| "grad_norm": 3.3235373497009277, |
| "learning_rate": 1.770512408477026e-05, |
| "loss": 0.0738, |
| "num_input_tokens_seen": 878568, |
| "step": 6800 |
| }, |
| { |
| "epoch": 6.353874883286648, |
| "grad_norm": 4.6949334144592285, |
| "learning_rate": 1.7666166140378852e-05, |
| "loss": 0.2285, |
| "num_input_tokens_seen": 879128, |
| "step": 6805 |
| }, |
| { |
| "epoch": 6.358543417366946, |
| "grad_norm": 4.452512741088867, |
| "learning_rate": 1.7627227672301302e-05, |
| "loss": 0.2037, |
| "num_input_tokens_seen": 879784, |
| "step": 6810 |
| }, |
| { |
| "epoch": 6.363211951447245, |
| "grad_norm": 2.7044475078582764, |
| "learning_rate": 1.7588308783945717e-05, |
| "loss": 0.053, |
| "num_input_tokens_seen": 880536, |
| "step": 6815 |
| }, |
| { |
| "epoch": 6.367880485527545, |
| "grad_norm": 15.64172077178955, |
| "learning_rate": 1.7549409578668206e-05, |
| "loss": 0.2094, |
| "num_input_tokens_seen": 881128, |
| "step": 6820 |
| }, |
| { |
| "epoch": 6.372549019607844, |
| "grad_norm": 19.395587921142578, |
| "learning_rate": 1.7510530159772586e-05, |
| "loss": 0.281, |
| "num_input_tokens_seen": 881736, |
| "step": 6825 |
| }, |
| { |
| "epoch": 6.377217553688142, |
| "grad_norm": 2.9765985012054443, |
| "learning_rate": 1.7471670630510152e-05, |
| "loss": 0.1436, |
| "num_input_tokens_seen": 882424, |
| "step": 6830 |
| }, |
| { |
| "epoch": 6.381886087768441, |
| "grad_norm": 1.3620012998580933, |
| "learning_rate": 1.7432831094079355e-05, |
| "loss": 0.0742, |
| "num_input_tokens_seen": 883176, |
| "step": 6835 |
| }, |
| { |
| "epoch": 6.38655462184874, |
| "grad_norm": 3.0921950340270996, |
| "learning_rate": 1.739401165362557e-05, |
| "loss": 0.1604, |
| "num_input_tokens_seen": 883816, |
| "step": 6840 |
| }, |
| { |
| "epoch": 6.391223155929039, |
| "grad_norm": 0.9669636487960815, |
| "learning_rate": 1.7355212412240817e-05, |
| "loss": 0.137, |
| "num_input_tokens_seen": 884440, |
| "step": 6845 |
| }, |
| { |
| "epoch": 6.395891690009337, |
| "grad_norm": 2.259877920150757, |
| "learning_rate": 1.7316433472963427e-05, |
| "loss": 0.4171, |
| "num_input_tokens_seen": 885144, |
| "step": 6850 |
| }, |
| { |
| "epoch": 6.400560224089636, |
| "grad_norm": 1.4858440160751343, |
| "learning_rate": 1.7277674938777855e-05, |
| "loss": 0.0331, |
| "num_input_tokens_seen": 885928, |
| "step": 6855 |
| }, |
| { |
| "epoch": 6.405228758169935, |
| "grad_norm": 6.3448591232299805, |
| "learning_rate": 1.723893691261435e-05, |
| "loss": 0.2404, |
| "num_input_tokens_seen": 886568, |
| "step": 6860 |
| }, |
| { |
| "epoch": 6.409897292250234, |
| "grad_norm": 7.592992305755615, |
| "learning_rate": 1.7200219497348707e-05, |
| "loss": 0.192, |
| "num_input_tokens_seen": 887224, |
| "step": 6865 |
| }, |
| { |
| "epoch": 6.4145658263305325, |
| "grad_norm": 1.5566842555999756, |
| "learning_rate": 1.716152279580199e-05, |
| "loss": 0.1125, |
| "num_input_tokens_seen": 887864, |
| "step": 6870 |
| }, |
| { |
| "epoch": 6.419234360410831, |
| "grad_norm": 6.8202128410339355, |
| "learning_rate": 1.712284691074022e-05, |
| "loss": 0.1109, |
| "num_input_tokens_seen": 888552, |
| "step": 6875 |
| }, |
| { |
| "epoch": 6.42390289449113, |
| "grad_norm": 6.580783367156982, |
| "learning_rate": 1.7084191944874174e-05, |
| "loss": 0.2174, |
| "num_input_tokens_seen": 889160, |
| "step": 6880 |
| }, |
| { |
| "epoch": 6.428571428571429, |
| "grad_norm": 1.1786524057388306, |
| "learning_rate": 1.7045558000859068e-05, |
| "loss": 0.0866, |
| "num_input_tokens_seen": 889800, |
| "step": 6885 |
| }, |
| { |
| "epoch": 6.433239962651728, |
| "grad_norm": 0.5497421026229858, |
| "learning_rate": 1.7006945181294275e-05, |
| "loss": 0.0944, |
| "num_input_tokens_seen": 890440, |
| "step": 6890 |
| }, |
| { |
| "epoch": 6.437908496732026, |
| "grad_norm": 0.2169853150844574, |
| "learning_rate": 1.6968353588723084e-05, |
| "loss": 0.2037, |
| "num_input_tokens_seen": 891112, |
| "step": 6895 |
| }, |
| { |
| "epoch": 6.442577030812325, |
| "grad_norm": 2.8242037296295166, |
| "learning_rate": 1.6929783325632393e-05, |
| "loss": 0.0724, |
| "num_input_tokens_seen": 891816, |
| "step": 6900 |
| }, |
| { |
| "epoch": 6.447245564892624, |
| "grad_norm": 5.830324172973633, |
| "learning_rate": 1.6891234494452476e-05, |
| "loss": 0.2002, |
| "num_input_tokens_seen": 892408, |
| "step": 6905 |
| }, |
| { |
| "epoch": 6.451914098972923, |
| "grad_norm": 0.2872973084449768, |
| "learning_rate": 1.6852707197556677e-05, |
| "loss": 0.043, |
| "num_input_tokens_seen": 893048, |
| "step": 6910 |
| }, |
| { |
| "epoch": 6.456582633053221, |
| "grad_norm": 4.120421409606934, |
| "learning_rate": 1.6814201537261162e-05, |
| "loss": 0.187, |
| "num_input_tokens_seen": 893672, |
| "step": 6915 |
| }, |
| { |
| "epoch": 6.46125116713352, |
| "grad_norm": 5.557107448577881, |
| "learning_rate": 1.677571761582464e-05, |
| "loss": 0.1201, |
| "num_input_tokens_seen": 894344, |
| "step": 6920 |
| }, |
| { |
| "epoch": 6.465919701213819, |
| "grad_norm": 7.2985076904296875, |
| "learning_rate": 1.6737255535448063e-05, |
| "loss": 0.2039, |
| "num_input_tokens_seen": 894920, |
| "step": 6925 |
| }, |
| { |
| "epoch": 6.470588235294118, |
| "grad_norm": 0.9252108931541443, |
| "learning_rate": 1.669881539827441e-05, |
| "loss": 0.1017, |
| "num_input_tokens_seen": 895512, |
| "step": 6930 |
| }, |
| { |
| "epoch": 6.4752567693744165, |
| "grad_norm": 13.045816421508789, |
| "learning_rate": 1.6660397306388364e-05, |
| "loss": 0.1111, |
| "num_input_tokens_seen": 896216, |
| "step": 6935 |
| }, |
| { |
| "epoch": 6.479925303454715, |
| "grad_norm": 3.7154791355133057, |
| "learning_rate": 1.662200136181609e-05, |
| "loss": 0.1685, |
| "num_input_tokens_seen": 896856, |
| "step": 6940 |
| }, |
| { |
| "epoch": 6.484593837535014, |
| "grad_norm": 1.7174103260040283, |
| "learning_rate": 1.6583627666524902e-05, |
| "loss": 0.1315, |
| "num_input_tokens_seen": 897560, |
| "step": 6945 |
| }, |
| { |
| "epoch": 6.489262371615313, |
| "grad_norm": 2.6956374645233154, |
| "learning_rate": 1.6545276322423054e-05, |
| "loss": 0.102, |
| "num_input_tokens_seen": 898216, |
| "step": 6950 |
| }, |
| { |
| "epoch": 6.493930905695612, |
| "grad_norm": 2.201688766479492, |
| "learning_rate": 1.650694743135942e-05, |
| "loss": 0.065, |
| "num_input_tokens_seen": 898840, |
| "step": 6955 |
| }, |
| { |
| "epoch": 6.49859943977591, |
| "grad_norm": 7.494628429412842, |
| "learning_rate": 1.6468641095123273e-05, |
| "loss": 0.3311, |
| "num_input_tokens_seen": 899464, |
| "step": 6960 |
| }, |
| { |
| "epoch": 6.503267973856209, |
| "grad_norm": 3.3498306274414062, |
| "learning_rate": 1.643035741544398e-05, |
| "loss": 0.0319, |
| "num_input_tokens_seen": 900152, |
| "step": 6965 |
| }, |
| { |
| "epoch": 6.506069094304388, |
| "eval_loss": 0.9735978245735168, |
| "eval_runtime": 3.8751, |
| "eval_samples_per_second": 61.418, |
| "eval_steps_per_second": 30.709, |
| "num_input_tokens_seen": 900568, |
| "step": 6968 |
| }, |
| { |
| "epoch": 6.507936507936508, |
| "grad_norm": 3.840616464614868, |
| "learning_rate": 1.6392096493990713e-05, |
| "loss": 0.1514, |
| "num_input_tokens_seen": 900792, |
| "step": 6970 |
| }, |
| { |
| "epoch": 6.512605042016807, |
| "grad_norm": 0.730390191078186, |
| "learning_rate": 1.6353858432372228e-05, |
| "loss": 0.3106, |
| "num_input_tokens_seen": 901464, |
| "step": 6975 |
| }, |
| { |
| "epoch": 6.5172735760971054, |
| "grad_norm": 6.283236980438232, |
| "learning_rate": 1.631564333213658e-05, |
| "loss": 0.2399, |
| "num_input_tokens_seen": 902088, |
| "step": 6980 |
| }, |
| { |
| "epoch": 6.521942110177404, |
| "grad_norm": 5.225600242614746, |
| "learning_rate": 1.6277451294770834e-05, |
| "loss": 0.1521, |
| "num_input_tokens_seen": 902712, |
| "step": 6985 |
| }, |
| { |
| "epoch": 6.526610644257703, |
| "grad_norm": 0.35495224595069885, |
| "learning_rate": 1.6239282421700807e-05, |
| "loss": 0.2793, |
| "num_input_tokens_seen": 903448, |
| "step": 6990 |
| }, |
| { |
| "epoch": 6.531279178338002, |
| "grad_norm": 5.897226810455322, |
| "learning_rate": 1.6201136814290802e-05, |
| "loss": 0.0455, |
| "num_input_tokens_seen": 904088, |
| "step": 6995 |
| }, |
| { |
| "epoch": 6.5359477124183005, |
| "grad_norm": 9.795798301696777, |
| "learning_rate": 1.6163014573843323e-05, |
| "loss": 0.1692, |
| "num_input_tokens_seen": 904728, |
| "step": 7000 |
| }, |
| { |
| "epoch": 6.540616246498599, |
| "grad_norm": 5.636192321777344, |
| "learning_rate": 1.6124915801598852e-05, |
| "loss": 0.0688, |
| "num_input_tokens_seen": 905304, |
| "step": 7005 |
| }, |
| { |
| "epoch": 6.545284780578898, |
| "grad_norm": 14.28243350982666, |
| "learning_rate": 1.6086840598735507e-05, |
| "loss": 0.023, |
| "num_input_tokens_seen": 905896, |
| "step": 7010 |
| }, |
| { |
| "epoch": 6.549953314659197, |
| "grad_norm": 0.4303819537162781, |
| "learning_rate": 1.6048789066368858e-05, |
| "loss": 0.2272, |
| "num_input_tokens_seen": 906552, |
| "step": 7015 |
| }, |
| { |
| "epoch": 6.554621848739496, |
| "grad_norm": 8.530994415283203, |
| "learning_rate": 1.6010761305551553e-05, |
| "loss": 0.1954, |
| "num_input_tokens_seen": 907160, |
| "step": 7020 |
| }, |
| { |
| "epoch": 6.559290382819794, |
| "grad_norm": 0.3170902132987976, |
| "learning_rate": 1.5972757417273166e-05, |
| "loss": 0.1061, |
| "num_input_tokens_seen": 907784, |
| "step": 7025 |
| }, |
| { |
| "epoch": 6.563958916900093, |
| "grad_norm": 0.19078557193279266, |
| "learning_rate": 1.5934777502459855e-05, |
| "loss": 0.1826, |
| "num_input_tokens_seen": 908456, |
| "step": 7030 |
| }, |
| { |
| "epoch": 6.568627450980392, |
| "grad_norm": 3.0452566146850586, |
| "learning_rate": 1.5896821661974098e-05, |
| "loss": 0.116, |
| "num_input_tokens_seen": 909064, |
| "step": 7035 |
| }, |
| { |
| "epoch": 6.573295985060691, |
| "grad_norm": 3.596776008605957, |
| "learning_rate": 1.5858889996614468e-05, |
| "loss": 0.0836, |
| "num_input_tokens_seen": 909656, |
| "step": 7040 |
| }, |
| { |
| "epoch": 6.5779645191409895, |
| "grad_norm": 0.0865001380443573, |
| "learning_rate": 1.5820982607115297e-05, |
| "loss": 0.0932, |
| "num_input_tokens_seen": 910280, |
| "step": 7045 |
| }, |
| { |
| "epoch": 6.582633053221288, |
| "grad_norm": 6.569155216217041, |
| "learning_rate": 1.578309959414649e-05, |
| "loss": 0.1726, |
| "num_input_tokens_seen": 910872, |
| "step": 7050 |
| }, |
| { |
| "epoch": 6.587301587301587, |
| "grad_norm": 0.8266429305076599, |
| "learning_rate": 1.574524105831318e-05, |
| "loss": 0.1021, |
| "num_input_tokens_seen": 911560, |
| "step": 7055 |
| }, |
| { |
| "epoch": 6.591970121381886, |
| "grad_norm": 46.219268798828125, |
| "learning_rate": 1.5707407100155517e-05, |
| "loss": 0.2182, |
| "num_input_tokens_seen": 912184, |
| "step": 7060 |
| }, |
| { |
| "epoch": 6.5966386554621845, |
| "grad_norm": 14.47891902923584, |
| "learning_rate": 1.5669597820148398e-05, |
| "loss": 0.201, |
| "num_input_tokens_seen": 912824, |
| "step": 7065 |
| }, |
| { |
| "epoch": 6.601307189542483, |
| "grad_norm": 0.10131403058767319, |
| "learning_rate": 1.5631813318701138e-05, |
| "loss": 0.2368, |
| "num_input_tokens_seen": 913528, |
| "step": 7070 |
| }, |
| { |
| "epoch": 6.605975723622782, |
| "grad_norm": 4.317729949951172, |
| "learning_rate": 1.559405369615727e-05, |
| "loss": 0.1386, |
| "num_input_tokens_seen": 914168, |
| "step": 7075 |
| }, |
| { |
| "epoch": 6.610644257703081, |
| "grad_norm": 10.373050689697266, |
| "learning_rate": 1.5556319052794267e-05, |
| "loss": 0.2691, |
| "num_input_tokens_seen": 914856, |
| "step": 7080 |
| }, |
| { |
| "epoch": 6.61531279178338, |
| "grad_norm": 0.2939615845680237, |
| "learning_rate": 1.5518609488823258e-05, |
| "loss": 0.1503, |
| "num_input_tokens_seen": 915432, |
| "step": 7085 |
| }, |
| { |
| "epoch": 6.619981325863678, |
| "grad_norm": 2.4400196075439453, |
| "learning_rate": 1.5480925104388762e-05, |
| "loss": 0.0218, |
| "num_input_tokens_seen": 916072, |
| "step": 7090 |
| }, |
| { |
| "epoch": 6.624649859943977, |
| "grad_norm": 2.928807497024536, |
| "learning_rate": 1.544326599956844e-05, |
| "loss": 0.1635, |
| "num_input_tokens_seen": 916728, |
| "step": 7095 |
| }, |
| { |
| "epoch": 6.629318394024276, |
| "grad_norm": 0.10388700664043427, |
| "learning_rate": 1.54056322743728e-05, |
| "loss": 0.132, |
| "num_input_tokens_seen": 917304, |
| "step": 7100 |
| }, |
| { |
| "epoch": 6.633986928104575, |
| "grad_norm": 2.3083953857421875, |
| "learning_rate": 1.5368024028744976e-05, |
| "loss": 0.1171, |
| "num_input_tokens_seen": 918024, |
| "step": 7105 |
| }, |
| { |
| "epoch": 6.6386554621848735, |
| "grad_norm": 16.023496627807617, |
| "learning_rate": 1.5330441362560425e-05, |
| "loss": 0.1859, |
| "num_input_tokens_seen": 918760, |
| "step": 7110 |
| }, |
| { |
| "epoch": 6.643323996265173, |
| "grad_norm": 0.659245491027832, |
| "learning_rate": 1.5292884375626664e-05, |
| "loss": 0.0545, |
| "num_input_tokens_seen": 919400, |
| "step": 7115 |
| }, |
| { |
| "epoch": 6.647992530345472, |
| "grad_norm": 0.08721905946731567, |
| "learning_rate": 1.5255353167683017e-05, |
| "loss": 0.0533, |
| "num_input_tokens_seen": 919976, |
| "step": 7120 |
| }, |
| { |
| "epoch": 6.652661064425771, |
| "grad_norm": 7.850348949432373, |
| "learning_rate": 1.5217847838400362e-05, |
| "loss": 0.2076, |
| "num_input_tokens_seen": 920664, |
| "step": 7125 |
| }, |
| { |
| "epoch": 6.657329598506069, |
| "grad_norm": 1.92311692237854, |
| "learning_rate": 1.5180368487380839e-05, |
| "loss": 0.133, |
| "num_input_tokens_seen": 921336, |
| "step": 7130 |
| }, |
| { |
| "epoch": 6.661998132586368, |
| "grad_norm": 5.412630081176758, |
| "learning_rate": 1.5142915214157605e-05, |
| "loss": 0.2534, |
| "num_input_tokens_seen": 921976, |
| "step": 7135 |
| }, |
| { |
| "epoch": 6.666666666666667, |
| "grad_norm": 5.1437177658081055, |
| "learning_rate": 1.5105488118194544e-05, |
| "loss": 0.2151, |
| "num_input_tokens_seen": 922520, |
| "step": 7140 |
| }, |
| { |
| "epoch": 6.671335200746966, |
| "grad_norm": 2.109266996383667, |
| "learning_rate": 1.5068087298886041e-05, |
| "loss": 0.0364, |
| "num_input_tokens_seen": 923128, |
| "step": 7145 |
| }, |
| { |
| "epoch": 6.6760037348272645, |
| "grad_norm": 4.490639686584473, |
| "learning_rate": 1.5030712855556705e-05, |
| "loss": 0.136, |
| "num_input_tokens_seen": 923832, |
| "step": 7150 |
| }, |
| { |
| "epoch": 6.680672268907563, |
| "grad_norm": 5.7482008934021, |
| "learning_rate": 1.499336488746107e-05, |
| "loss": 0.1265, |
| "num_input_tokens_seen": 924552, |
| "step": 7155 |
| }, |
| { |
| "epoch": 6.685340802987862, |
| "grad_norm": 0.4535566568374634, |
| "learning_rate": 1.4956043493783401e-05, |
| "loss": 0.0958, |
| "num_input_tokens_seen": 925208, |
| "step": 7160 |
| }, |
| { |
| "epoch": 6.690009337068161, |
| "grad_norm": 1.7882232666015625, |
| "learning_rate": 1.4918748773637337e-05, |
| "loss": 0.0885, |
| "num_input_tokens_seen": 925976, |
| "step": 7165 |
| }, |
| { |
| "epoch": 6.69467787114846, |
| "grad_norm": 3.1727871894836426, |
| "learning_rate": 1.4881480826065736e-05, |
| "loss": 0.0629, |
| "num_input_tokens_seen": 926648, |
| "step": 7170 |
| }, |
| { |
| "epoch": 6.699346405228758, |
| "grad_norm": 0.20563183724880219, |
| "learning_rate": 1.4844239750040308e-05, |
| "loss": 0.0594, |
| "num_input_tokens_seen": 927272, |
| "step": 7175 |
| }, |
| { |
| "epoch": 6.704014939309057, |
| "grad_norm": 3.364445686340332, |
| "learning_rate": 1.4807025644461436e-05, |
| "loss": 0.0427, |
| "num_input_tokens_seen": 927880, |
| "step": 7180 |
| }, |
| { |
| "epoch": 6.708683473389356, |
| "grad_norm": 1.1256296634674072, |
| "learning_rate": 1.4769838608157877e-05, |
| "loss": 0.1734, |
| "num_input_tokens_seen": 928568, |
| "step": 7185 |
| }, |
| { |
| "epoch": 6.713352007469655, |
| "grad_norm": 0.5387191772460938, |
| "learning_rate": 1.4732678739886468e-05, |
| "loss": 0.0138, |
| "num_input_tokens_seen": 929224, |
| "step": 7190 |
| }, |
| { |
| "epoch": 6.718020541549953, |
| "grad_norm": 5.2028398513793945, |
| "learning_rate": 1.4695546138331928e-05, |
| "loss": 0.1926, |
| "num_input_tokens_seen": 929832, |
| "step": 7195 |
| }, |
| { |
| "epoch": 6.722689075630252, |
| "grad_norm": 0.20318341255187988, |
| "learning_rate": 1.465844090210655e-05, |
| "loss": 0.07, |
| "num_input_tokens_seen": 930488, |
| "step": 7200 |
| }, |
| { |
| "epoch": 6.727357609710551, |
| "grad_norm": 8.667532920837402, |
| "learning_rate": 1.4621363129749958e-05, |
| "loss": 0.098, |
| "num_input_tokens_seen": 931128, |
| "step": 7205 |
| }, |
| { |
| "epoch": 6.73202614379085, |
| "grad_norm": 3.0935397148132324, |
| "learning_rate": 1.4584312919728853e-05, |
| "loss": 0.0368, |
| "num_input_tokens_seen": 931688, |
| "step": 7210 |
| }, |
| { |
| "epoch": 6.7366946778711485, |
| "grad_norm": 3.340531349182129, |
| "learning_rate": 1.4547290370436707e-05, |
| "loss": 0.245, |
| "num_input_tokens_seen": 932360, |
| "step": 7215 |
| }, |
| { |
| "epoch": 6.741363211951447, |
| "grad_norm": 1.7328623533248901, |
| "learning_rate": 1.451029558019356e-05, |
| "loss": 0.0781, |
| "num_input_tokens_seen": 933000, |
| "step": 7220 |
| }, |
| { |
| "epoch": 6.746031746031746, |
| "grad_norm": 12.751051902770996, |
| "learning_rate": 1.4473328647245726e-05, |
| "loss": 0.0738, |
| "num_input_tokens_seen": 933672, |
| "step": 7225 |
| }, |
| { |
| "epoch": 6.750700280112045, |
| "grad_norm": 3.262699842453003, |
| "learning_rate": 1.4436389669765543e-05, |
| "loss": 0.1764, |
| "num_input_tokens_seen": 934296, |
| "step": 7230 |
| }, |
| { |
| "epoch": 6.755368814192344, |
| "grad_norm": 0.568540632724762, |
| "learning_rate": 1.4399478745851107e-05, |
| "loss": 0.0288, |
| "num_input_tokens_seen": 935048, |
| "step": 7235 |
| }, |
| { |
| "epoch": 6.760037348272642, |
| "grad_norm": 8.316237449645996, |
| "learning_rate": 1.4362595973526005e-05, |
| "loss": 0.1433, |
| "num_input_tokens_seen": 935720, |
| "step": 7240 |
| }, |
| { |
| "epoch": 6.764705882352941, |
| "grad_norm": 4.168196678161621, |
| "learning_rate": 1.4325741450739072e-05, |
| "loss": 0.1233, |
| "num_input_tokens_seen": 936360, |
| "step": 7245 |
| }, |
| { |
| "epoch": 6.76937441643324, |
| "grad_norm": 0.5175381302833557, |
| "learning_rate": 1.4288915275364107e-05, |
| "loss": 0.0847, |
| "num_input_tokens_seen": 937000, |
| "step": 7250 |
| }, |
| { |
| "epoch": 6.774042950513539, |
| "grad_norm": 0.7255039215087891, |
| "learning_rate": 1.425211754519964e-05, |
| "loss": 0.1635, |
| "num_input_tokens_seen": 937608, |
| "step": 7255 |
| }, |
| { |
| "epoch": 6.778711484593837, |
| "grad_norm": 1.7699940204620361, |
| "learning_rate": 1.4215348357968669e-05, |
| "loss": 0.0913, |
| "num_input_tokens_seen": 938200, |
| "step": 7260 |
| }, |
| { |
| "epoch": 6.783380018674136, |
| "grad_norm": 6.348759651184082, |
| "learning_rate": 1.4178607811318361e-05, |
| "loss": 0.2961, |
| "num_input_tokens_seen": 938824, |
| "step": 7265 |
| }, |
| { |
| "epoch": 6.788048552754435, |
| "grad_norm": 10.6443510055542, |
| "learning_rate": 1.4141896002819854e-05, |
| "loss": 0.0846, |
| "num_input_tokens_seen": 939528, |
| "step": 7270 |
| }, |
| { |
| "epoch": 6.792717086834734, |
| "grad_norm": 0.29247570037841797, |
| "learning_rate": 1.4105213029967945e-05, |
| "loss": 0.2019, |
| "num_input_tokens_seen": 940120, |
| "step": 7275 |
| }, |
| { |
| "epoch": 6.7973856209150325, |
| "grad_norm": 2.046889305114746, |
| "learning_rate": 1.4068558990180875e-05, |
| "loss": 0.2389, |
| "num_input_tokens_seen": 940792, |
| "step": 7280 |
| }, |
| { |
| "epoch": 6.802054154995331, |
| "grad_norm": 25.0539493560791, |
| "learning_rate": 1.4031933980800028e-05, |
| "loss": 0.5873, |
| "num_input_tokens_seen": 941416, |
| "step": 7285 |
| }, |
| { |
| "epoch": 6.80672268907563, |
| "grad_norm": 2.1702511310577393, |
| "learning_rate": 1.399533809908968e-05, |
| "loss": 0.0993, |
| "num_input_tokens_seen": 942024, |
| "step": 7290 |
| }, |
| { |
| "epoch": 6.811391223155929, |
| "grad_norm": 1.2021642923355103, |
| "learning_rate": 1.395877144223679e-05, |
| "loss": 0.1214, |
| "num_input_tokens_seen": 942712, |
| "step": 7295 |
| }, |
| { |
| "epoch": 6.816059757236228, |
| "grad_norm": 5.554089069366455, |
| "learning_rate": 1.3922234107350684e-05, |
| "loss": 0.0719, |
| "num_input_tokens_seen": 943368, |
| "step": 7300 |
| }, |
| { |
| "epoch": 6.820728291316526, |
| "grad_norm": 4.765249252319336, |
| "learning_rate": 1.388572619146283e-05, |
| "loss": 0.1859, |
| "num_input_tokens_seen": 944088, |
| "step": 7305 |
| }, |
| { |
| "epoch": 6.825396825396825, |
| "grad_norm": 3.2210237979888916, |
| "learning_rate": 1.3849247791526543e-05, |
| "loss": 0.165, |
| "num_input_tokens_seen": 944728, |
| "step": 7310 |
| }, |
| { |
| "epoch": 6.830065359477124, |
| "grad_norm": 0.09927883744239807, |
| "learning_rate": 1.3812799004416779e-05, |
| "loss": 0.0848, |
| "num_input_tokens_seen": 945352, |
| "step": 7315 |
| }, |
| { |
| "epoch": 6.834733893557423, |
| "grad_norm": 4.660610675811768, |
| "learning_rate": 1.3776379926929842e-05, |
| "loss": 0.1015, |
| "num_input_tokens_seen": 946008, |
| "step": 7320 |
| }, |
| { |
| "epoch": 6.839402427637721, |
| "grad_norm": 2.595057249069214, |
| "learning_rate": 1.3739990655783147e-05, |
| "loss": 0.0979, |
| "num_input_tokens_seen": 946584, |
| "step": 7325 |
| }, |
| { |
| "epoch": 6.84407096171802, |
| "grad_norm": 4.264656066894531, |
| "learning_rate": 1.3703631287614935e-05, |
| "loss": 0.2259, |
| "num_input_tokens_seen": 947224, |
| "step": 7330 |
| }, |
| { |
| "epoch": 6.848739495798319, |
| "grad_norm": 4.981330871582031, |
| "learning_rate": 1.3667301918984032e-05, |
| "loss": 0.3648, |
| "num_input_tokens_seen": 947896, |
| "step": 7335 |
| }, |
| { |
| "epoch": 6.853408029878618, |
| "grad_norm": 3.690455913543701, |
| "learning_rate": 1.3631002646369615e-05, |
| "loss": 0.0457, |
| "num_input_tokens_seen": 948552, |
| "step": 7340 |
| }, |
| { |
| "epoch": 6.8580765639589165, |
| "grad_norm": 1.3205879926681519, |
| "learning_rate": 1.3594733566170926e-05, |
| "loss": 0.0872, |
| "num_input_tokens_seen": 949112, |
| "step": 7345 |
| }, |
| { |
| "epoch": 6.862745098039216, |
| "grad_norm": 1.1191065311431885, |
| "learning_rate": 1.3558494774707026e-05, |
| "loss": 0.1099, |
| "num_input_tokens_seen": 949816, |
| "step": 7350 |
| }, |
| { |
| "epoch": 6.867413632119515, |
| "grad_norm": 6.446228504180908, |
| "learning_rate": 1.3522286368216553e-05, |
| "loss": 0.2311, |
| "num_input_tokens_seen": 950408, |
| "step": 7355 |
| }, |
| { |
| "epoch": 6.872082166199814, |
| "grad_norm": 2.946200370788574, |
| "learning_rate": 1.3486108442857412e-05, |
| "loss": 0.2259, |
| "num_input_tokens_seen": 951016, |
| "step": 7360 |
| }, |
| { |
| "epoch": 6.8767507002801125, |
| "grad_norm": 0.6405202746391296, |
| "learning_rate": 1.3449961094706606e-05, |
| "loss": 0.1807, |
| "num_input_tokens_seen": 951656, |
| "step": 7365 |
| }, |
| { |
| "epoch": 6.881419234360411, |
| "grad_norm": 3.961355209350586, |
| "learning_rate": 1.34138444197599e-05, |
| "loss": 0.1963, |
| "num_input_tokens_seen": 952328, |
| "step": 7370 |
| }, |
| { |
| "epoch": 6.88608776844071, |
| "grad_norm": 4.892948627471924, |
| "learning_rate": 1.3377758513931621e-05, |
| "loss": 0.1093, |
| "num_input_tokens_seen": 952968, |
| "step": 7375 |
| }, |
| { |
| "epoch": 6.890756302521009, |
| "grad_norm": 1.85221266746521, |
| "learning_rate": 1.3341703473054384e-05, |
| "loss": 0.2895, |
| "num_input_tokens_seen": 953640, |
| "step": 7380 |
| }, |
| { |
| "epoch": 6.895424836601308, |
| "grad_norm": 5.994509220123291, |
| "learning_rate": 1.3305679392878817e-05, |
| "loss": 0.2183, |
| "num_input_tokens_seen": 954248, |
| "step": 7385 |
| }, |
| { |
| "epoch": 6.900093370681606, |
| "grad_norm": 12.080351829528809, |
| "learning_rate": 1.3269686369073347e-05, |
| "loss": 0.2908, |
| "num_input_tokens_seen": 954856, |
| "step": 7390 |
| }, |
| { |
| "epoch": 6.904761904761905, |
| "grad_norm": 2.8995492458343506, |
| "learning_rate": 1.3233724497223914e-05, |
| "loss": 0.1119, |
| "num_input_tokens_seen": 955496, |
| "step": 7395 |
| }, |
| { |
| "epoch": 6.909430438842204, |
| "grad_norm": 3.4688098430633545, |
| "learning_rate": 1.3197793872833735e-05, |
| "loss": 0.1273, |
| "num_input_tokens_seen": 956072, |
| "step": 7400 |
| }, |
| { |
| "epoch": 6.914098972922503, |
| "grad_norm": 7.582507610321045, |
| "learning_rate": 1.316189459132305e-05, |
| "loss": 0.1408, |
| "num_input_tokens_seen": 956680, |
| "step": 7405 |
| }, |
| { |
| "epoch": 6.918767507002801, |
| "grad_norm": 9.396662712097168, |
| "learning_rate": 1.3126026748028843e-05, |
| "loss": 0.2068, |
| "num_input_tokens_seen": 957304, |
| "step": 7410 |
| }, |
| { |
| "epoch": 6.9234360410831, |
| "grad_norm": 1.258137822151184, |
| "learning_rate": 1.3090190438204607e-05, |
| "loss": 0.1737, |
| "num_input_tokens_seen": 957928, |
| "step": 7415 |
| }, |
| { |
| "epoch": 6.928104575163399, |
| "grad_norm": 8.436930656433105, |
| "learning_rate": 1.3054385757020119e-05, |
| "loss": 0.1514, |
| "num_input_tokens_seen": 958600, |
| "step": 7420 |
| }, |
| { |
| "epoch": 6.932773109243698, |
| "grad_norm": 3.9069759845733643, |
| "learning_rate": 1.3018612799561137e-05, |
| "loss": 0.0558, |
| "num_input_tokens_seen": 959240, |
| "step": 7425 |
| }, |
| { |
| "epoch": 6.9374416433239965, |
| "grad_norm": 13.299653053283691, |
| "learning_rate": 1.2982871660829191e-05, |
| "loss": 0.1571, |
| "num_input_tokens_seen": 959976, |
| "step": 7430 |
| }, |
| { |
| "epoch": 6.942110177404295, |
| "grad_norm": 2.508751630783081, |
| "learning_rate": 1.2947162435741278e-05, |
| "loss": 0.0605, |
| "num_input_tokens_seen": 960712, |
| "step": 7435 |
| }, |
| { |
| "epoch": 6.946778711484594, |
| "grad_norm": 9.50046443939209, |
| "learning_rate": 1.2911485219129677e-05, |
| "loss": 0.2175, |
| "num_input_tokens_seen": 961304, |
| "step": 7440 |
| }, |
| { |
| "epoch": 6.951447245564893, |
| "grad_norm": 0.7765032052993774, |
| "learning_rate": 1.2875840105741654e-05, |
| "loss": 0.2403, |
| "num_input_tokens_seen": 961976, |
| "step": 7445 |
| }, |
| { |
| "epoch": 6.956115779645192, |
| "grad_norm": 1.1209977865219116, |
| "learning_rate": 1.2840227190239195e-05, |
| "loss": 0.0854, |
| "num_input_tokens_seen": 962616, |
| "step": 7450 |
| }, |
| { |
| "epoch": 6.96078431372549, |
| "grad_norm": 1.1824790239334106, |
| "learning_rate": 1.2804646567198818e-05, |
| "loss": 0.0786, |
| "num_input_tokens_seen": 963224, |
| "step": 7455 |
| }, |
| { |
| "epoch": 6.965452847805789, |
| "grad_norm": 10.131546974182129, |
| "learning_rate": 1.2769098331111246e-05, |
| "loss": 0.1808, |
| "num_input_tokens_seen": 963736, |
| "step": 7460 |
| }, |
| { |
| "epoch": 6.970121381886088, |
| "grad_norm": 0.2978059649467468, |
| "learning_rate": 1.2733582576381211e-05, |
| "loss": 0.0198, |
| "num_input_tokens_seen": 964328, |
| "step": 7465 |
| }, |
| { |
| "epoch": 6.974789915966387, |
| "grad_norm": 0.7438709735870361, |
| "learning_rate": 1.269809939732719e-05, |
| "loss": 0.131, |
| "num_input_tokens_seen": 964952, |
| "step": 7470 |
| }, |
| { |
| "epoch": 6.979458450046685, |
| "grad_norm": 3.8623740673065186, |
| "learning_rate": 1.2662648888181145e-05, |
| "loss": 0.1148, |
| "num_input_tokens_seen": 965672, |
| "step": 7475 |
| }, |
| { |
| "epoch": 6.984126984126984, |
| "grad_norm": 2.075993061065674, |
| "learning_rate": 1.2627231143088259e-05, |
| "loss": 0.039, |
| "num_input_tokens_seen": 966408, |
| "step": 7480 |
| }, |
| { |
| "epoch": 6.988795518207283, |
| "grad_norm": 2.753314971923828, |
| "learning_rate": 1.2591846256106732e-05, |
| "loss": 0.1244, |
| "num_input_tokens_seen": 967032, |
| "step": 7485 |
| }, |
| { |
| "epoch": 6.993464052287582, |
| "grad_norm": 0.46315088868141174, |
| "learning_rate": 1.255649432120749e-05, |
| "loss": 0.1778, |
| "num_input_tokens_seen": 967592, |
| "step": 7490 |
| }, |
| { |
| "epoch": 6.9981325863678805, |
| "grad_norm": 4.949937343597412, |
| "learning_rate": 1.252117543227394e-05, |
| "loss": 0.0908, |
| "num_input_tokens_seen": 968200, |
| "step": 7495 |
| }, |
| { |
| "epoch": 7.002801120448179, |
| "grad_norm": 0.3900282382965088, |
| "learning_rate": 1.2485889683101758e-05, |
| "loss": 0.0472, |
| "num_input_tokens_seen": 968752, |
| "step": 7500 |
| }, |
| { |
| "epoch": 7.006535947712418, |
| "eval_loss": 0.969052255153656, |
| "eval_runtime": 3.8766, |
| "eval_samples_per_second": 61.394, |
| "eval_steps_per_second": 30.697, |
| "num_input_tokens_seen": 969200, |
| "step": 7504 |
| }, |
| { |
| "epoch": 7.007469654528478, |
| "grad_norm": 1.4273070096969604, |
| "learning_rate": 1.2450637167398571e-05, |
| "loss": 0.0418, |
| "num_input_tokens_seen": 969344, |
| "step": 7505 |
| }, |
| { |
| "epoch": 7.012138188608777, |
| "grad_norm": 16.452985763549805, |
| "learning_rate": 1.2415417978783777e-05, |
| "loss": 0.1801, |
| "num_input_tokens_seen": 969920, |
| "step": 7510 |
| }, |
| { |
| "epoch": 7.016806722689076, |
| "grad_norm": 0.042361099272966385, |
| "learning_rate": 1.2380232210788265e-05, |
| "loss": 0.0203, |
| "num_input_tokens_seen": 970560, |
| "step": 7515 |
| }, |
| { |
| "epoch": 7.021475256769374, |
| "grad_norm": 3.3039093017578125, |
| "learning_rate": 1.2345079956854164e-05, |
| "loss": 0.0695, |
| "num_input_tokens_seen": 971152, |
| "step": 7520 |
| }, |
| { |
| "epoch": 7.026143790849673, |
| "grad_norm": 1.6099435091018677, |
| "learning_rate": 1.2309961310334609e-05, |
| "loss": 0.0611, |
| "num_input_tokens_seen": 971728, |
| "step": 7525 |
| }, |
| { |
| "epoch": 7.030812324929972, |
| "grad_norm": 5.736599922180176, |
| "learning_rate": 1.2274876364493474e-05, |
| "loss": 0.0954, |
| "num_input_tokens_seen": 972384, |
| "step": 7530 |
| }, |
| { |
| "epoch": 7.035480859010271, |
| "grad_norm": 9.741619110107422, |
| "learning_rate": 1.2239825212505124e-05, |
| "loss": 0.0737, |
| "num_input_tokens_seen": 973120, |
| "step": 7535 |
| }, |
| { |
| "epoch": 7.040149393090569, |
| "grad_norm": 2.7313551902770996, |
| "learning_rate": 1.2204807947454203e-05, |
| "loss": 0.1562, |
| "num_input_tokens_seen": 973792, |
| "step": 7540 |
| }, |
| { |
| "epoch": 7.044817927170868, |
| "grad_norm": 4.553571701049805, |
| "learning_rate": 1.2169824662335352e-05, |
| "loss": 0.0597, |
| "num_input_tokens_seen": 974400, |
| "step": 7545 |
| }, |
| { |
| "epoch": 7.049486461251167, |
| "grad_norm": 0.03360239043831825, |
| "learning_rate": 1.2134875450052979e-05, |
| "loss": 0.0293, |
| "num_input_tokens_seen": 974992, |
| "step": 7550 |
| }, |
| { |
| "epoch": 7.054154995331466, |
| "grad_norm": 4.209261417388916, |
| "learning_rate": 1.2099960403420985e-05, |
| "loss": 0.0987, |
| "num_input_tokens_seen": 975568, |
| "step": 7555 |
| }, |
| { |
| "epoch": 7.0588235294117645, |
| "grad_norm": 9.991903305053711, |
| "learning_rate": 1.2065079615162559e-05, |
| "loss": 0.0483, |
| "num_input_tokens_seen": 976288, |
| "step": 7560 |
| }, |
| { |
| "epoch": 7.063492063492063, |
| "grad_norm": 2.156618356704712, |
| "learning_rate": 1.2030233177909896e-05, |
| "loss": 0.0854, |
| "num_input_tokens_seen": 976848, |
| "step": 7565 |
| }, |
| { |
| "epoch": 7.068160597572362, |
| "grad_norm": 5.080526351928711, |
| "learning_rate": 1.1995421184203992e-05, |
| "loss": 0.1016, |
| "num_input_tokens_seen": 977568, |
| "step": 7570 |
| }, |
| { |
| "epoch": 7.072829131652661, |
| "grad_norm": 1.6115537881851196, |
| "learning_rate": 1.196064372649434e-05, |
| "loss": 0.0218, |
| "num_input_tokens_seen": 978240, |
| "step": 7575 |
| }, |
| { |
| "epoch": 7.07749766573296, |
| "grad_norm": 1.5015766620635986, |
| "learning_rate": 1.1925900897138718e-05, |
| "loss": 0.034, |
| "num_input_tokens_seen": 978928, |
| "step": 7580 |
| }, |
| { |
| "epoch": 7.082166199813258, |
| "grad_norm": 2.4616377353668213, |
| "learning_rate": 1.189119278840296e-05, |
| "loss": 0.0496, |
| "num_input_tokens_seen": 979840, |
| "step": 7585 |
| }, |
| { |
| "epoch": 7.086834733893557, |
| "grad_norm": 18.41492462158203, |
| "learning_rate": 1.1856519492460694e-05, |
| "loss": 0.2292, |
| "num_input_tokens_seen": 980336, |
| "step": 7590 |
| }, |
| { |
| "epoch": 7.091503267973856, |
| "grad_norm": 1.4025119543075562, |
| "learning_rate": 1.1821881101393084e-05, |
| "loss": 0.0471, |
| "num_input_tokens_seen": 980992, |
| "step": 7595 |
| }, |
| { |
| "epoch": 7.096171802054155, |
| "grad_norm": 9.339607238769531, |
| "learning_rate": 1.1787277707188616e-05, |
| "loss": 0.1712, |
| "num_input_tokens_seen": 981760, |
| "step": 7600 |
| }, |
| { |
| "epoch": 7.100840336134453, |
| "grad_norm": 1.5936764478683472, |
| "learning_rate": 1.1752709401742799e-05, |
| "loss": 0.0083, |
| "num_input_tokens_seen": 982352, |
| "step": 7605 |
| }, |
| { |
| "epoch": 7.105508870214752, |
| "grad_norm": 0.4758903682231903, |
| "learning_rate": 1.1718176276858001e-05, |
| "loss": 0.0177, |
| "num_input_tokens_seen": 982960, |
| "step": 7610 |
| }, |
| { |
| "epoch": 7.110177404295051, |
| "grad_norm": 9.756832122802734, |
| "learning_rate": 1.1683678424243122e-05, |
| "loss": 0.0674, |
| "num_input_tokens_seen": 983520, |
| "step": 7615 |
| }, |
| { |
| "epoch": 7.11484593837535, |
| "grad_norm": 3.2350313663482666, |
| "learning_rate": 1.1649215935513422e-05, |
| "loss": 0.0395, |
| "num_input_tokens_seen": 984240, |
| "step": 7620 |
| }, |
| { |
| "epoch": 7.1195144724556485, |
| "grad_norm": 13.398826599121094, |
| "learning_rate": 1.161478890219024e-05, |
| "loss": 0.0716, |
| "num_input_tokens_seen": 984912, |
| "step": 7625 |
| }, |
| { |
| "epoch": 7.124183006535947, |
| "grad_norm": 0.20864218473434448, |
| "learning_rate": 1.1580397415700733e-05, |
| "loss": 0.0486, |
| "num_input_tokens_seen": 985520, |
| "step": 7630 |
| }, |
| { |
| "epoch": 7.128851540616246, |
| "grad_norm": 5.6734299659729, |
| "learning_rate": 1.1546041567377686e-05, |
| "loss": 0.0489, |
| "num_input_tokens_seen": 986176, |
| "step": 7635 |
| }, |
| { |
| "epoch": 7.133520074696546, |
| "grad_norm": 0.6356542110443115, |
| "learning_rate": 1.1511721448459223e-05, |
| "loss": 0.1587, |
| "num_input_tokens_seen": 986768, |
| "step": 7640 |
| }, |
| { |
| "epoch": 7.1381886087768445, |
| "grad_norm": 0.35997679829597473, |
| "learning_rate": 1.14774371500886e-05, |
| "loss": 0.0241, |
| "num_input_tokens_seen": 987376, |
| "step": 7645 |
| }, |
| { |
| "epoch": 7.142857142857143, |
| "grad_norm": 1.3711936473846436, |
| "learning_rate": 1.1443188763313915e-05, |
| "loss": 0.108, |
| "num_input_tokens_seen": 987936, |
| "step": 7650 |
| }, |
| { |
| "epoch": 7.147525676937442, |
| "grad_norm": 0.11039953678846359, |
| "learning_rate": 1.1408976379087932e-05, |
| "loss": 0.0807, |
| "num_input_tokens_seen": 988592, |
| "step": 7655 |
| }, |
| { |
| "epoch": 7.152194211017741, |
| "grad_norm": 3.1219189167022705, |
| "learning_rate": 1.1374800088267767e-05, |
| "loss": 0.2025, |
| "num_input_tokens_seen": 989168, |
| "step": 7660 |
| }, |
| { |
| "epoch": 7.1568627450980395, |
| "grad_norm": 1.7455995082855225, |
| "learning_rate": 1.1340659981614715e-05, |
| "loss": 0.1152, |
| "num_input_tokens_seen": 989696, |
| "step": 7665 |
| }, |
| { |
| "epoch": 7.161531279178338, |
| "grad_norm": 13.108963966369629, |
| "learning_rate": 1.1306556149793971e-05, |
| "loss": 0.0463, |
| "num_input_tokens_seen": 990288, |
| "step": 7670 |
| }, |
| { |
| "epoch": 7.166199813258637, |
| "grad_norm": 2.7943058013916016, |
| "learning_rate": 1.1272488683374369e-05, |
| "loss": 0.0585, |
| "num_input_tokens_seen": 990880, |
| "step": 7675 |
| }, |
| { |
| "epoch": 7.170868347338936, |
| "grad_norm": 3.618239641189575, |
| "learning_rate": 1.1238457672828204e-05, |
| "loss": 0.0628, |
| "num_input_tokens_seen": 991472, |
| "step": 7680 |
| }, |
| { |
| "epoch": 7.175536881419235, |
| "grad_norm": 1.0060943365097046, |
| "learning_rate": 1.1204463208530936e-05, |
| "loss": 0.1359, |
| "num_input_tokens_seen": 992048, |
| "step": 7685 |
| }, |
| { |
| "epoch": 7.180205415499533, |
| "grad_norm": 0.04717274010181427, |
| "learning_rate": 1.1170505380760984e-05, |
| "loss": 0.229, |
| "num_input_tokens_seen": 992704, |
| "step": 7690 |
| }, |
| { |
| "epoch": 7.184873949579832, |
| "grad_norm": 0.1487988978624344, |
| "learning_rate": 1.1136584279699458e-05, |
| "loss": 0.0174, |
| "num_input_tokens_seen": 993472, |
| "step": 7695 |
| }, |
| { |
| "epoch": 7.189542483660131, |
| "grad_norm": 11.722426414489746, |
| "learning_rate": 1.1102699995429921e-05, |
| "loss": 0.0752, |
| "num_input_tokens_seen": 994064, |
| "step": 7700 |
| }, |
| { |
| "epoch": 7.19421101774043, |
| "grad_norm": 9.349660873413086, |
| "learning_rate": 1.1068852617938196e-05, |
| "loss": 0.2765, |
| "num_input_tokens_seen": 994656, |
| "step": 7705 |
| }, |
| { |
| "epoch": 7.1988795518207285, |
| "grad_norm": 1.1677833795547485, |
| "learning_rate": 1.1035042237112076e-05, |
| "loss": 0.0618, |
| "num_input_tokens_seen": 995312, |
| "step": 7710 |
| }, |
| { |
| "epoch": 7.203548085901027, |
| "grad_norm": 0.6099132895469666, |
| "learning_rate": 1.1001268942741099e-05, |
| "loss": 0.0155, |
| "num_input_tokens_seen": 995984, |
| "step": 7715 |
| }, |
| { |
| "epoch": 7.208216619981326, |
| "grad_norm": 3.9095051288604736, |
| "learning_rate": 1.0967532824516334e-05, |
| "loss": 0.0812, |
| "num_input_tokens_seen": 996624, |
| "step": 7720 |
| }, |
| { |
| "epoch": 7.212885154061625, |
| "grad_norm": 0.8077293634414673, |
| "learning_rate": 1.0933833972030081e-05, |
| "loss": 0.0118, |
| "num_input_tokens_seen": 997184, |
| "step": 7725 |
| }, |
| { |
| "epoch": 7.2175536881419236, |
| "grad_norm": 0.04219364747405052, |
| "learning_rate": 1.0900172474775714e-05, |
| "loss": 0.0526, |
| "num_input_tokens_seen": 997888, |
| "step": 7730 |
| }, |
| { |
| "epoch": 7.222222222222222, |
| "grad_norm": 4.558381080627441, |
| "learning_rate": 1.086654842214739e-05, |
| "loss": 0.0981, |
| "num_input_tokens_seen": 998496, |
| "step": 7735 |
| }, |
| { |
| "epoch": 7.226890756302521, |
| "grad_norm": 4.1472086906433105, |
| "learning_rate": 1.0832961903439815e-05, |
| "loss": 0.1938, |
| "num_input_tokens_seen": 999136, |
| "step": 7740 |
| }, |
| { |
| "epoch": 7.23155929038282, |
| "grad_norm": 1.4830154180526733, |
| "learning_rate": 1.0799413007848039e-05, |
| "loss": 0.1462, |
| "num_input_tokens_seen": 999728, |
| "step": 7745 |
| }, |
| { |
| "epoch": 7.236227824463119, |
| "grad_norm": 0.15791846811771393, |
| "learning_rate": 1.0765901824467167e-05, |
| "loss": 0.0642, |
| "num_input_tokens_seen": 1000384, |
| "step": 7750 |
| }, |
| { |
| "epoch": 7.240896358543417, |
| "grad_norm": 7.790059566497803, |
| "learning_rate": 1.0732428442292174e-05, |
| "loss": 0.1602, |
| "num_input_tokens_seen": 1001088, |
| "step": 7755 |
| }, |
| { |
| "epoch": 7.245564892623716, |
| "grad_norm": 1.7139387130737305, |
| "learning_rate": 1.0698992950217649e-05, |
| "loss": 0.0208, |
| "num_input_tokens_seen": 1001808, |
| "step": 7760 |
| }, |
| { |
| "epoch": 7.250233426704015, |
| "grad_norm": 6.180329322814941, |
| "learning_rate": 1.0665595437037545e-05, |
| "loss": 0.5134, |
| "num_input_tokens_seen": 1002560, |
| "step": 7765 |
| }, |
| { |
| "epoch": 7.254901960784314, |
| "grad_norm": 0.14951446652412415, |
| "learning_rate": 1.0632235991444972e-05, |
| "loss": 0.0613, |
| "num_input_tokens_seen": 1003200, |
| "step": 7770 |
| }, |
| { |
| "epoch": 7.2595704948646125, |
| "grad_norm": 0.27165699005126953, |
| "learning_rate": 1.0598914702031923e-05, |
| "loss": 0.0963, |
| "num_input_tokens_seen": 1003776, |
| "step": 7775 |
| }, |
| { |
| "epoch": 7.264239028944911, |
| "grad_norm": 2.8281211853027344, |
| "learning_rate": 1.0565631657289064e-05, |
| "loss": 0.1009, |
| "num_input_tokens_seen": 1004480, |
| "step": 7780 |
| }, |
| { |
| "epoch": 7.26890756302521, |
| "grad_norm": 0.6824027895927429, |
| "learning_rate": 1.0532386945605508e-05, |
| "loss": 0.0515, |
| "num_input_tokens_seen": 1005120, |
| "step": 7785 |
| }, |
| { |
| "epoch": 7.273576097105509, |
| "grad_norm": 0.21875107288360596, |
| "learning_rate": 1.0499180655268562e-05, |
| "loss": 0.0814, |
| "num_input_tokens_seen": 1005712, |
| "step": 7790 |
| }, |
| { |
| "epoch": 7.278244631185808, |
| "grad_norm": 7.165660858154297, |
| "learning_rate": 1.0466012874463507e-05, |
| "loss": 0.1297, |
| "num_input_tokens_seen": 1006400, |
| "step": 7795 |
| }, |
| { |
| "epoch": 7.282913165266106, |
| "grad_norm": 0.6695095896720886, |
| "learning_rate": 1.0432883691273329e-05, |
| "loss": 0.0837, |
| "num_input_tokens_seen": 1007024, |
| "step": 7800 |
| }, |
| { |
| "epoch": 7.287581699346405, |
| "grad_norm": 1.1955631971359253, |
| "learning_rate": 1.039979319367854e-05, |
| "loss": 0.0841, |
| "num_input_tokens_seen": 1007744, |
| "step": 7805 |
| }, |
| { |
| "epoch": 7.292250233426704, |
| "grad_norm": 2.22499942779541, |
| "learning_rate": 1.0366741469556906e-05, |
| "loss": 0.0635, |
| "num_input_tokens_seen": 1008352, |
| "step": 7810 |
| }, |
| { |
| "epoch": 7.296918767507003, |
| "grad_norm": 2.4868786334991455, |
| "learning_rate": 1.0333728606683204e-05, |
| "loss": 0.0457, |
| "num_input_tokens_seen": 1009008, |
| "step": 7815 |
| }, |
| { |
| "epoch": 7.301587301587301, |
| "grad_norm": 5.993730068206787, |
| "learning_rate": 1.0300754692729047e-05, |
| "loss": 0.0993, |
| "num_input_tokens_seen": 1009600, |
| "step": 7820 |
| }, |
| { |
| "epoch": 7.3062558356676, |
| "grad_norm": 3.345144510269165, |
| "learning_rate": 1.026781981526257e-05, |
| "loss": 0.0906, |
| "num_input_tokens_seen": 1010224, |
| "step": 7825 |
| }, |
| { |
| "epoch": 7.310924369747899, |
| "grad_norm": 3.8627970218658447, |
| "learning_rate": 1.0234924061748263e-05, |
| "loss": 0.1081, |
| "num_input_tokens_seen": 1010896, |
| "step": 7830 |
| }, |
| { |
| "epoch": 7.315592903828198, |
| "grad_norm": 15.3854341506958, |
| "learning_rate": 1.0202067519546718e-05, |
| "loss": 0.1311, |
| "num_input_tokens_seen": 1011536, |
| "step": 7835 |
| }, |
| { |
| "epoch": 7.3202614379084965, |
| "grad_norm": 3.0586917400360107, |
| "learning_rate": 1.0169250275914394e-05, |
| "loss": 0.0298, |
| "num_input_tokens_seen": 1012208, |
| "step": 7840 |
| }, |
| { |
| "epoch": 7.324929971988795, |
| "grad_norm": 2.1076900959014893, |
| "learning_rate": 1.0136472418003362e-05, |
| "loss": 0.0336, |
| "num_input_tokens_seen": 1012944, |
| "step": 7845 |
| }, |
| { |
| "epoch": 7.329598506069094, |
| "grad_norm": 6.537927627563477, |
| "learning_rate": 1.0103734032861123e-05, |
| "loss": 0.0968, |
| "num_input_tokens_seen": 1013568, |
| "step": 7850 |
| }, |
| { |
| "epoch": 7.334267040149393, |
| "grad_norm": 4.479717254638672, |
| "learning_rate": 1.0071035207430352e-05, |
| "loss": 0.0535, |
| "num_input_tokens_seen": 1014240, |
| "step": 7855 |
| }, |
| { |
| "epoch": 7.338935574229692, |
| "grad_norm": 1.1037297248840332, |
| "learning_rate": 1.0038376028548637e-05, |
| "loss": 0.0067, |
| "num_input_tokens_seen": 1014864, |
| "step": 7860 |
| }, |
| { |
| "epoch": 7.34360410830999, |
| "grad_norm": 1.1650564670562744, |
| "learning_rate": 1.0005756582948316e-05, |
| "loss": 0.1159, |
| "num_input_tokens_seen": 1015520, |
| "step": 7865 |
| }, |
| { |
| "epoch": 7.348272642390289, |
| "grad_norm": 0.03593031316995621, |
| "learning_rate": 9.973176957256175e-06, |
| "loss": 0.145, |
| "num_input_tokens_seen": 1016112, |
| "step": 7870 |
| }, |
| { |
| "epoch": 7.352941176470588, |
| "grad_norm": 3.5073347091674805, |
| "learning_rate": 9.940637237993269e-06, |
| "loss": 0.0585, |
| "num_input_tokens_seen": 1016752, |
| "step": 7875 |
| }, |
| { |
| "epoch": 7.357609710550887, |
| "grad_norm": 1.1457678079605103, |
| "learning_rate": 9.908137511574675e-06, |
| "loss": 0.0701, |
| "num_input_tokens_seen": 1017344, |
| "step": 7880 |
| }, |
| { |
| "epoch": 7.362278244631185, |
| "grad_norm": 8.913501739501953, |
| "learning_rate": 9.875677864309255e-06, |
| "loss": 0.0607, |
| "num_input_tokens_seen": 1017984, |
| "step": 7885 |
| }, |
| { |
| "epoch": 7.366946778711484, |
| "grad_norm": 1.648511290550232, |
| "learning_rate": 9.843258382399442e-06, |
| "loss": 0.0465, |
| "num_input_tokens_seen": 1018592, |
| "step": 7890 |
| }, |
| { |
| "epoch": 7.371615312791784, |
| "grad_norm": 3.110186815261841, |
| "learning_rate": 9.810879151940982e-06, |
| "loss": 0.06, |
| "num_input_tokens_seen": 1019264, |
| "step": 7895 |
| }, |
| { |
| "epoch": 7.376283846872083, |
| "grad_norm": 4.371214866638184, |
| "learning_rate": 9.778540258922762e-06, |
| "loss": 0.1029, |
| "num_input_tokens_seen": 1019856, |
| "step": 7900 |
| }, |
| { |
| "epoch": 7.380952380952381, |
| "grad_norm": 20.6632137298584, |
| "learning_rate": 9.746241789226502e-06, |
| "loss": 0.1593, |
| "num_input_tokens_seen": 1020432, |
| "step": 7905 |
| }, |
| { |
| "epoch": 7.38562091503268, |
| "grad_norm": 38.212284088134766, |
| "learning_rate": 9.7139838286266e-06, |
| "loss": 0.6516, |
| "num_input_tokens_seen": 1021056, |
| "step": 7910 |
| }, |
| { |
| "epoch": 7.390289449112979, |
| "grad_norm": 0.10892492532730103, |
| "learning_rate": 9.681766462789883e-06, |
| "loss": 0.0418, |
| "num_input_tokens_seen": 1021696, |
| "step": 7915 |
| }, |
| { |
| "epoch": 7.394957983193278, |
| "grad_norm": 7.932071208953857, |
| "learning_rate": 9.649589777275334e-06, |
| "loss": 0.1172, |
| "num_input_tokens_seen": 1022304, |
| "step": 7920 |
| }, |
| { |
| "epoch": 7.3996265172735765, |
| "grad_norm": 3.0598278045654297, |
| "learning_rate": 9.617453857533934e-06, |
| "loss": 0.1303, |
| "num_input_tokens_seen": 1023040, |
| "step": 7925 |
| }, |
| { |
| "epoch": 7.404295051353875, |
| "grad_norm": 6.430777072906494, |
| "learning_rate": 9.585358788908394e-06, |
| "loss": 0.1392, |
| "num_input_tokens_seen": 1023632, |
| "step": 7930 |
| }, |
| { |
| "epoch": 7.408963585434174, |
| "grad_norm": 7.087793350219727, |
| "learning_rate": 9.553304656632944e-06, |
| "loss": 0.1354, |
| "num_input_tokens_seen": 1024304, |
| "step": 7935 |
| }, |
| { |
| "epoch": 7.413632119514473, |
| "grad_norm": 8.742671966552734, |
| "learning_rate": 9.521291545833086e-06, |
| "loss": 0.1999, |
| "num_input_tokens_seen": 1024960, |
| "step": 7940 |
| }, |
| { |
| "epoch": 7.4183006535947715, |
| "grad_norm": 6.230234622955322, |
| "learning_rate": 9.489319541525383e-06, |
| "loss": 0.1121, |
| "num_input_tokens_seen": 1025536, |
| "step": 7945 |
| }, |
| { |
| "epoch": 7.42296918767507, |
| "grad_norm": 10.568910598754883, |
| "learning_rate": 9.457388728617239e-06, |
| "loss": 0.106, |
| "num_input_tokens_seen": 1026352, |
| "step": 7950 |
| }, |
| { |
| "epoch": 7.427637721755369, |
| "grad_norm": 2.8674209117889404, |
| "learning_rate": 9.425499191906675e-06, |
| "loss": 0.1084, |
| "num_input_tokens_seen": 1026992, |
| "step": 7955 |
| }, |
| { |
| "epoch": 7.432306255835668, |
| "grad_norm": 1.4187448024749756, |
| "learning_rate": 9.393651016082083e-06, |
| "loss": 0.1247, |
| "num_input_tokens_seen": 1027600, |
| "step": 7960 |
| }, |
| { |
| "epoch": 7.436974789915967, |
| "grad_norm": 0.17856673896312714, |
| "learning_rate": 9.361844285722027e-06, |
| "loss": 0.0572, |
| "num_input_tokens_seen": 1028192, |
| "step": 7965 |
| }, |
| { |
| "epoch": 7.441643323996265, |
| "grad_norm": 0.6200303435325623, |
| "learning_rate": 9.33007908529498e-06, |
| "loss": 0.0672, |
| "num_input_tokens_seen": 1028736, |
| "step": 7970 |
| }, |
| { |
| "epoch": 7.446311858076564, |
| "grad_norm": 0.7025942802429199, |
| "learning_rate": 9.298355499159156e-06, |
| "loss": 0.0064, |
| "num_input_tokens_seen": 1029456, |
| "step": 7975 |
| }, |
| { |
| "epoch": 7.450980392156863, |
| "grad_norm": 13.140531539916992, |
| "learning_rate": 9.266673611562221e-06, |
| "loss": 0.0942, |
| "num_input_tokens_seen": 1030160, |
| "step": 7980 |
| }, |
| { |
| "epoch": 7.455648926237162, |
| "grad_norm": 4.14353084564209, |
| "learning_rate": 9.23503350664113e-06, |
| "loss": 0.1122, |
| "num_input_tokens_seen": 1030832, |
| "step": 7985 |
| }, |
| { |
| "epoch": 7.4603174603174605, |
| "grad_norm": 5.697788715362549, |
| "learning_rate": 9.203435268421881e-06, |
| "loss": 0.1073, |
| "num_input_tokens_seen": 1031456, |
| "step": 7990 |
| }, |
| { |
| "epoch": 7.464985994397759, |
| "grad_norm": 5.6900129318237305, |
| "learning_rate": 9.171878980819254e-06, |
| "loss": 0.0624, |
| "num_input_tokens_seen": 1032064, |
| "step": 7995 |
| }, |
| { |
| "epoch": 7.469654528478058, |
| "grad_norm": 2.6561965942382812, |
| "learning_rate": 9.140364727636651e-06, |
| "loss": 0.0618, |
| "num_input_tokens_seen": 1032656, |
| "step": 8000 |
| }, |
| { |
| "epoch": 7.474323062558357, |
| "grad_norm": 7.419023036956787, |
| "learning_rate": 9.108892592565837e-06, |
| "loss": 0.0905, |
| "num_input_tokens_seen": 1033280, |
| "step": 8005 |
| }, |
| { |
| "epoch": 7.4789915966386555, |
| "grad_norm": 0.4191058278083801, |
| "learning_rate": 9.077462659186728e-06, |
| "loss": 0.0261, |
| "num_input_tokens_seen": 1033920, |
| "step": 8010 |
| }, |
| { |
| "epoch": 7.483660130718954, |
| "grad_norm": 5.563292503356934, |
| "learning_rate": 9.046075010967145e-06, |
| "loss": 0.0657, |
| "num_input_tokens_seen": 1034560, |
| "step": 8015 |
| }, |
| { |
| "epoch": 7.488328664799253, |
| "grad_norm": 0.4257217049598694, |
| "learning_rate": 9.014729731262647e-06, |
| "loss": 0.0997, |
| "num_input_tokens_seen": 1035312, |
| "step": 8020 |
| }, |
| { |
| "epoch": 7.492997198879552, |
| "grad_norm": 0.023411273956298828, |
| "learning_rate": 8.983426903316242e-06, |
| "loss": 0.0257, |
| "num_input_tokens_seen": 1035888, |
| "step": 8025 |
| }, |
| { |
| "epoch": 7.497665732959851, |
| "grad_norm": 8.618074417114258, |
| "learning_rate": 8.95216661025822e-06, |
| "loss": 0.2303, |
| "num_input_tokens_seen": 1036512, |
| "step": 8030 |
| }, |
| { |
| "epoch": 7.502334267040149, |
| "grad_norm": 4.378950595855713, |
| "learning_rate": 8.92094893510592e-06, |
| "loss": 0.102, |
| "num_input_tokens_seen": 1037248, |
| "step": 8035 |
| }, |
| { |
| "epoch": 7.507002801120448, |
| "grad_norm": 8.568061828613281, |
| "learning_rate": 8.889773960763465e-06, |
| "loss": 0.1088, |
| "num_input_tokens_seen": 1037856, |
| "step": 8040 |
| }, |
| { |
| "epoch": 7.507002801120448, |
| "eval_loss": 1.077528476715088, |
| "eval_runtime": 3.8754, |
| "eval_samples_per_second": 61.414, |
| "eval_steps_per_second": 30.707, |
| "num_input_tokens_seen": 1037856, |
| "step": 8040 |
| }, |
| { |
| "epoch": 7.511671335200747, |
| "grad_norm": 9.698957443237305, |
| "learning_rate": 8.858641770021619e-06, |
| "loss": 0.1022, |
| "num_input_tokens_seen": 1038496, |
| "step": 8045 |
| }, |
| { |
| "epoch": 7.516339869281046, |
| "grad_norm": 4.305899620056152, |
| "learning_rate": 8.827552445557505e-06, |
| "loss": 0.2151, |
| "num_input_tokens_seen": 1039136, |
| "step": 8050 |
| }, |
| { |
| "epoch": 7.5210084033613445, |
| "grad_norm": 0.05098443478345871, |
| "learning_rate": 8.79650606993442e-06, |
| "loss": 0.0433, |
| "num_input_tokens_seen": 1039728, |
| "step": 8055 |
| }, |
| { |
| "epoch": 7.525676937441643, |
| "grad_norm": 3.4601404666900635, |
| "learning_rate": 8.765502725601582e-06, |
| "loss": 0.059, |
| "num_input_tokens_seen": 1040384, |
| "step": 8060 |
| }, |
| { |
| "epoch": 7.530345471521942, |
| "grad_norm": 0.7538869380950928, |
| "learning_rate": 8.734542494893955e-06, |
| "loss": 0.0568, |
| "num_input_tokens_seen": 1040976, |
| "step": 8065 |
| }, |
| { |
| "epoch": 7.535014005602241, |
| "grad_norm": 0.21339933574199677, |
| "learning_rate": 8.70362546003198e-06, |
| "loss": 0.0355, |
| "num_input_tokens_seen": 1041632, |
| "step": 8070 |
| }, |
| { |
| "epoch": 7.5396825396825395, |
| "grad_norm": 0.22579693794250488, |
| "learning_rate": 8.67275170312141e-06, |
| "loss": 0.1002, |
| "num_input_tokens_seen": 1042256, |
| "step": 8075 |
| }, |
| { |
| "epoch": 7.544351073762838, |
| "grad_norm": 15.217583656311035, |
| "learning_rate": 8.641921306153052e-06, |
| "loss": 0.0909, |
| "num_input_tokens_seen": 1042928, |
| "step": 8080 |
| }, |
| { |
| "epoch": 7.549019607843137, |
| "grad_norm": 4.866734981536865, |
| "learning_rate": 8.611134351002579e-06, |
| "loss": 0.0347, |
| "num_input_tokens_seen": 1043504, |
| "step": 8085 |
| }, |
| { |
| "epoch": 7.553688141923436, |
| "grad_norm": 3.1651668548583984, |
| "learning_rate": 8.580390919430264e-06, |
| "loss": 0.0467, |
| "num_input_tokens_seen": 1044192, |
| "step": 8090 |
| }, |
| { |
| "epoch": 7.558356676003735, |
| "grad_norm": 1.9329001903533936, |
| "learning_rate": 8.549691093080822e-06, |
| "loss": 0.0826, |
| "num_input_tokens_seen": 1044800, |
| "step": 8095 |
| }, |
| { |
| "epoch": 7.563025210084033, |
| "grad_norm": 9.483233451843262, |
| "learning_rate": 8.519034953483171e-06, |
| "loss": 0.1911, |
| "num_input_tokens_seen": 1045488, |
| "step": 8100 |
| }, |
| { |
| "epoch": 7.567693744164332, |
| "grad_norm": 6.546195030212402, |
| "learning_rate": 8.488422582050182e-06, |
| "loss": 0.0514, |
| "num_input_tokens_seen": 1046112, |
| "step": 8105 |
| }, |
| { |
| "epoch": 7.572362278244631, |
| "grad_norm": 6.966008186340332, |
| "learning_rate": 8.45785406007852e-06, |
| "loss": 0.0977, |
| "num_input_tokens_seen": 1046768, |
| "step": 8110 |
| }, |
| { |
| "epoch": 7.57703081232493, |
| "grad_norm": 0.8122055530548096, |
| "learning_rate": 8.42732946874838e-06, |
| "loss": 0.0274, |
| "num_input_tokens_seen": 1047424, |
| "step": 8115 |
| }, |
| { |
| "epoch": 7.5816993464052285, |
| "grad_norm": 0.033146731555461884, |
| "learning_rate": 8.396848889123304e-06, |
| "loss": 0.1072, |
| "num_input_tokens_seen": 1048048, |
| "step": 8120 |
| }, |
| { |
| "epoch": 7.586367880485527, |
| "grad_norm": 1.4818317890167236, |
| "learning_rate": 8.366412402149954e-06, |
| "loss": 0.0435, |
| "num_input_tokens_seen": 1048720, |
| "step": 8125 |
| }, |
| { |
| "epoch": 7.591036414565826, |
| "grad_norm": 1.447939395904541, |
| "learning_rate": 8.336020088657884e-06, |
| "loss": 0.1128, |
| "num_input_tokens_seen": 1049424, |
| "step": 8130 |
| }, |
| { |
| "epoch": 7.595704948646125, |
| "grad_norm": 10.602599143981934, |
| "learning_rate": 8.305672029359357e-06, |
| "loss": 0.0777, |
| "num_input_tokens_seen": 1050016, |
| "step": 8135 |
| }, |
| { |
| "epoch": 7.6003734827264235, |
| "grad_norm": 1.1130367517471313, |
| "learning_rate": 8.27536830484909e-06, |
| "loss": 0.1002, |
| "num_input_tokens_seen": 1050608, |
| "step": 8140 |
| }, |
| { |
| "epoch": 7.605042016806722, |
| "grad_norm": 0.2835767865180969, |
| "learning_rate": 8.245108995604061e-06, |
| "loss": 0.0834, |
| "num_input_tokens_seen": 1051216, |
| "step": 8145 |
| }, |
| { |
| "epoch": 7.609710550887021, |
| "grad_norm": 1.318534016609192, |
| "learning_rate": 8.214894181983314e-06, |
| "loss": 0.0129, |
| "num_input_tokens_seen": 1052000, |
| "step": 8150 |
| }, |
| { |
| "epoch": 7.61437908496732, |
| "grad_norm": 12.885905265808105, |
| "learning_rate": 8.184723944227717e-06, |
| "loss": 0.1562, |
| "num_input_tokens_seen": 1052624, |
| "step": 8155 |
| }, |
| { |
| "epoch": 7.619047619047619, |
| "grad_norm": 5.901303768157959, |
| "learning_rate": 8.154598362459765e-06, |
| "loss": 0.1439, |
| "num_input_tokens_seen": 1053344, |
| "step": 8160 |
| }, |
| { |
| "epoch": 7.623716153127917, |
| "grad_norm": 6.216315269470215, |
| "learning_rate": 8.124517516683337e-06, |
| "loss": 0.1417, |
| "num_input_tokens_seen": 1053920, |
| "step": 8165 |
| }, |
| { |
| "epoch": 7.628384687208216, |
| "grad_norm": 1.9915357828140259, |
| "learning_rate": 8.094481486783534e-06, |
| "loss": 0.108, |
| "num_input_tokens_seen": 1054656, |
| "step": 8170 |
| }, |
| { |
| "epoch": 7.633053221288515, |
| "grad_norm": 4.136562824249268, |
| "learning_rate": 8.064490352526432e-06, |
| "loss": 0.1449, |
| "num_input_tokens_seen": 1055328, |
| "step": 8175 |
| }, |
| { |
| "epoch": 7.637721755368814, |
| "grad_norm": 0.12521328032016754, |
| "learning_rate": 8.034544193558888e-06, |
| "loss": 0.0834, |
| "num_input_tokens_seen": 1055920, |
| "step": 8180 |
| }, |
| { |
| "epoch": 7.642390289449113, |
| "grad_norm": 2.289386034011841, |
| "learning_rate": 8.0046430894083e-06, |
| "loss": 0.0068, |
| "num_input_tokens_seen": 1056496, |
| "step": 8185 |
| }, |
| { |
| "epoch": 7.647058823529412, |
| "grad_norm": 8.30339527130127, |
| "learning_rate": 7.974787119482416e-06, |
| "loss": 0.1128, |
| "num_input_tokens_seen": 1057152, |
| "step": 8190 |
| }, |
| { |
| "epoch": 7.651727357609711, |
| "grad_norm": 4.268256187438965, |
| "learning_rate": 7.944976363069137e-06, |
| "loss": 0.1039, |
| "num_input_tokens_seen": 1057776, |
| "step": 8195 |
| }, |
| { |
| "epoch": 7.65639589169001, |
| "grad_norm": 0.635925829410553, |
| "learning_rate": 7.915210899336284e-06, |
| "loss": 0.0245, |
| "num_input_tokens_seen": 1058432, |
| "step": 8200 |
| }, |
| { |
| "epoch": 7.661064425770308, |
| "grad_norm": 0.12795297801494598, |
| "learning_rate": 7.885490807331405e-06, |
| "loss": 0.0494, |
| "num_input_tokens_seen": 1059216, |
| "step": 8205 |
| }, |
| { |
| "epoch": 7.665732959850607, |
| "grad_norm": 0.22869738936424255, |
| "learning_rate": 7.855816165981528e-06, |
| "loss": 0.0965, |
| "num_input_tokens_seen": 1059792, |
| "step": 8210 |
| }, |
| { |
| "epoch": 7.670401493930906, |
| "grad_norm": 3.5849406719207764, |
| "learning_rate": 7.826187054093004e-06, |
| "loss": 0.0409, |
| "num_input_tokens_seen": 1060464, |
| "step": 8215 |
| }, |
| { |
| "epoch": 7.675070028011205, |
| "grad_norm": 0.6717523336410522, |
| "learning_rate": 7.796603550351276e-06, |
| "loss": 0.0769, |
| "num_input_tokens_seen": 1061152, |
| "step": 8220 |
| }, |
| { |
| "epoch": 7.6797385620915035, |
| "grad_norm": 3.960289716720581, |
| "learning_rate": 7.767065733320636e-06, |
| "loss": 0.0305, |
| "num_input_tokens_seen": 1061792, |
| "step": 8225 |
| }, |
| { |
| "epoch": 7.684407096171802, |
| "grad_norm": 3.6754231452941895, |
| "learning_rate": 7.737573681444082e-06, |
| "loss": 0.0808, |
| "num_input_tokens_seen": 1062512, |
| "step": 8230 |
| }, |
| { |
| "epoch": 7.689075630252101, |
| "grad_norm": 0.45391157269477844, |
| "learning_rate": 7.708127473043044e-06, |
| "loss": 0.097, |
| "num_input_tokens_seen": 1063184, |
| "step": 8235 |
| }, |
| { |
| "epoch": 7.6937441643324, |
| "grad_norm": 12.55863094329834, |
| "learning_rate": 7.678727186317225e-06, |
| "loss": 0.1237, |
| "num_input_tokens_seen": 1063776, |
| "step": 8240 |
| }, |
| { |
| "epoch": 7.698412698412699, |
| "grad_norm": 17.174266815185547, |
| "learning_rate": 7.649372899344376e-06, |
| "loss": 0.2717, |
| "num_input_tokens_seen": 1064384, |
| "step": 8245 |
| }, |
| { |
| "epoch": 7.703081232492997, |
| "grad_norm": 0.3060591518878937, |
| "learning_rate": 7.620064690080076e-06, |
| "loss": 0.0221, |
| "num_input_tokens_seen": 1064944, |
| "step": 8250 |
| }, |
| { |
| "epoch": 7.707749766573296, |
| "grad_norm": 3.2448484897613525, |
| "learning_rate": 7.59080263635755e-06, |
| "loss": 0.077, |
| "num_input_tokens_seen": 1065584, |
| "step": 8255 |
| }, |
| { |
| "epoch": 7.712418300653595, |
| "grad_norm": 1.3086873292922974, |
| "learning_rate": 7.561586815887428e-06, |
| "loss": 0.0542, |
| "num_input_tokens_seen": 1066320, |
| "step": 8260 |
| }, |
| { |
| "epoch": 7.717086834733894, |
| "grad_norm": 4.001552581787109, |
| "learning_rate": 7.532417306257589e-06, |
| "loss": 0.1468, |
| "num_input_tokens_seen": 1066992, |
| "step": 8265 |
| }, |
| { |
| "epoch": 7.721755368814192, |
| "grad_norm": 0.9954704642295837, |
| "learning_rate": 7.503294184932888e-06, |
| "loss": 0.0911, |
| "num_input_tokens_seen": 1067664, |
| "step": 8270 |
| }, |
| { |
| "epoch": 7.726423902894491, |
| "grad_norm": 0.2005486637353897, |
| "learning_rate": 7.474217529255018e-06, |
| "loss": 0.0434, |
| "num_input_tokens_seen": 1068304, |
| "step": 8275 |
| }, |
| { |
| "epoch": 7.73109243697479, |
| "grad_norm": 1.2900813817977905, |
| "learning_rate": 7.44518741644227e-06, |
| "loss": 0.0335, |
| "num_input_tokens_seen": 1068928, |
| "step": 8280 |
| }, |
| { |
| "epoch": 7.735760971055089, |
| "grad_norm": 0.8297515511512756, |
| "learning_rate": 7.416203923589312e-06, |
| "loss": 0.0565, |
| "num_input_tokens_seen": 1069680, |
| "step": 8285 |
| }, |
| { |
| "epoch": 7.7404295051353875, |
| "grad_norm": 2.7593870162963867, |
| "learning_rate": 7.387267127667028e-06, |
| "loss": 0.1607, |
| "num_input_tokens_seen": 1070368, |
| "step": 8290 |
| }, |
| { |
| "epoch": 7.745098039215686, |
| "grad_norm": 1.2917249202728271, |
| "learning_rate": 7.358377105522276e-06, |
| "loss": 0.0402, |
| "num_input_tokens_seen": 1071056, |
| "step": 8295 |
| }, |
| { |
| "epoch": 7.749766573295985, |
| "grad_norm": 2.100257396697998, |
| "learning_rate": 7.329533933877713e-06, |
| "loss": 0.1192, |
| "num_input_tokens_seen": 1071728, |
| "step": 8300 |
| }, |
| { |
| "epoch": 7.754435107376284, |
| "grad_norm": 0.5423152446746826, |
| "learning_rate": 7.300737689331555e-06, |
| "loss": 0.1131, |
| "num_input_tokens_seen": 1072368, |
| "step": 8305 |
| }, |
| { |
| "epoch": 7.759103641456583, |
| "grad_norm": 0.27788621187210083, |
| "learning_rate": 7.2719884483573975e-06, |
| "loss": 0.0266, |
| "num_input_tokens_seen": 1073056, |
| "step": 8310 |
| }, |
| { |
| "epoch": 7.763772175536881, |
| "grad_norm": 3.2515792846679688, |
| "learning_rate": 7.243286287304024e-06, |
| "loss": 0.1105, |
| "num_input_tokens_seen": 1073680, |
| "step": 8315 |
| }, |
| { |
| "epoch": 7.76844070961718, |
| "grad_norm": 6.515419006347656, |
| "learning_rate": 7.214631282395184e-06, |
| "loss": 0.1254, |
| "num_input_tokens_seen": 1074288, |
| "step": 8320 |
| }, |
| { |
| "epoch": 7.773109243697479, |
| "grad_norm": 0.06773083657026291, |
| "learning_rate": 7.186023509729392e-06, |
| "loss": 0.1299, |
| "num_input_tokens_seen": 1074944, |
| "step": 8325 |
| }, |
| { |
| "epoch": 7.777777777777778, |
| "grad_norm": 0.05192798748612404, |
| "learning_rate": 7.157463045279736e-06, |
| "loss": 0.1067, |
| "num_input_tokens_seen": 1075552, |
| "step": 8330 |
| }, |
| { |
| "epoch": 7.7824463118580764, |
| "grad_norm": 4.573253631591797, |
| "learning_rate": 7.128949964893647e-06, |
| "loss": 0.1099, |
| "num_input_tokens_seen": 1076192, |
| "step": 8335 |
| }, |
| { |
| "epoch": 7.787114845938375, |
| "grad_norm": 3.051204204559326, |
| "learning_rate": 7.100484344292743e-06, |
| "loss": 0.2794, |
| "num_input_tokens_seen": 1076880, |
| "step": 8340 |
| }, |
| { |
| "epoch": 7.791783380018674, |
| "grad_norm": 0.13514143228530884, |
| "learning_rate": 7.072066259072602e-06, |
| "loss": 0.0938, |
| "num_input_tokens_seen": 1077616, |
| "step": 8345 |
| }, |
| { |
| "epoch": 7.796451914098973, |
| "grad_norm": 2.7879085540771484, |
| "learning_rate": 7.043695784702553e-06, |
| "loss": 0.0664, |
| "num_input_tokens_seen": 1078288, |
| "step": 8350 |
| }, |
| { |
| "epoch": 7.8011204481792715, |
| "grad_norm": 1.6922277212142944, |
| "learning_rate": 7.015372996525477e-06, |
| "loss": 0.0457, |
| "num_input_tokens_seen": 1078896, |
| "step": 8355 |
| }, |
| { |
| "epoch": 7.80578898225957, |
| "grad_norm": 0.956597626209259, |
| "learning_rate": 6.987097969757636e-06, |
| "loss": 0.0687, |
| "num_input_tokens_seen": 1079664, |
| "step": 8360 |
| }, |
| { |
| "epoch": 7.810457516339869, |
| "grad_norm": 0.3348096013069153, |
| "learning_rate": 6.958870779488447e-06, |
| "loss": 0.0132, |
| "num_input_tokens_seen": 1080304, |
| "step": 8365 |
| }, |
| { |
| "epoch": 7.815126050420168, |
| "grad_norm": 10.991283416748047, |
| "learning_rate": 6.930691500680289e-06, |
| "loss": 0.2201, |
| "num_input_tokens_seen": 1080992, |
| "step": 8370 |
| }, |
| { |
| "epoch": 7.819794584500467, |
| "grad_norm": 0.2432507872581482, |
| "learning_rate": 6.902560208168304e-06, |
| "loss": 0.1369, |
| "num_input_tokens_seen": 1081664, |
| "step": 8375 |
| }, |
| { |
| "epoch": 7.824463118580765, |
| "grad_norm": 3.725372791290283, |
| "learning_rate": 6.8744769766601854e-06, |
| "loss": 0.2358, |
| "num_input_tokens_seen": 1082224, |
| "step": 8380 |
| }, |
| { |
| "epoch": 7.829131652661064, |
| "grad_norm": 0.3020375669002533, |
| "learning_rate": 6.8464418807360095e-06, |
| "loss": 0.0204, |
| "num_input_tokens_seen": 1082832, |
| "step": 8385 |
| }, |
| { |
| "epoch": 7.833800186741363, |
| "grad_norm": 0.009726503863930702, |
| "learning_rate": 6.818454994848006e-06, |
| "loss": 0.1, |
| "num_input_tokens_seen": 1083424, |
| "step": 8390 |
| }, |
| { |
| "epoch": 7.838468720821662, |
| "grad_norm": 10.553521156311035, |
| "learning_rate": 6.7905163933203785e-06, |
| "loss": 0.1198, |
| "num_input_tokens_seen": 1084144, |
| "step": 8395 |
| }, |
| { |
| "epoch": 7.8431372549019605, |
| "grad_norm": 0.8017410039901733, |
| "learning_rate": 6.762626150349119e-06, |
| "loss": 0.0439, |
| "num_input_tokens_seen": 1084816, |
| "step": 8400 |
| }, |
| { |
| "epoch": 7.847805788982259, |
| "grad_norm": 6.144360065460205, |
| "learning_rate": 6.7347843400017625e-06, |
| "loss": 0.1038, |
| "num_input_tokens_seen": 1085424, |
| "step": 8405 |
| }, |
| { |
| "epoch": 7.852474323062558, |
| "grad_norm": 6.126713275909424, |
| "learning_rate": 6.7069910362172474e-06, |
| "loss": 0.074, |
| "num_input_tokens_seen": 1086096, |
| "step": 8410 |
| }, |
| { |
| "epoch": 7.857142857142857, |
| "grad_norm": 1.0252084732055664, |
| "learning_rate": 6.679246312805687e-06, |
| "loss": 0.0591, |
| "num_input_tokens_seen": 1086816, |
| "step": 8415 |
| }, |
| { |
| "epoch": 7.861811391223156, |
| "grad_norm": 0.3098870813846588, |
| "learning_rate": 6.651550243448182e-06, |
| "loss": 0.0269, |
| "num_input_tokens_seen": 1087408, |
| "step": 8420 |
| }, |
| { |
| "epoch": 7.866479925303455, |
| "grad_norm": 1.8085873126983643, |
| "learning_rate": 6.62390290169663e-06, |
| "loss": 0.0183, |
| "num_input_tokens_seen": 1088112, |
| "step": 8425 |
| }, |
| { |
| "epoch": 7.871148459383754, |
| "grad_norm": 4.327649116516113, |
| "learning_rate": 6.596304360973504e-06, |
| "loss": 0.0375, |
| "num_input_tokens_seen": 1088704, |
| "step": 8430 |
| }, |
| { |
| "epoch": 7.875816993464053, |
| "grad_norm": 0.09986984729766846, |
| "learning_rate": 6.568754694571685e-06, |
| "loss": 0.0063, |
| "num_input_tokens_seen": 1089328, |
| "step": 8435 |
| }, |
| { |
| "epoch": 7.8804855275443515, |
| "grad_norm": 6.41303825378418, |
| "learning_rate": 6.541253975654271e-06, |
| "loss": 0.0485, |
| "num_input_tokens_seen": 1089984, |
| "step": 8440 |
| }, |
| { |
| "epoch": 7.88515406162465, |
| "grad_norm": 1.0045740604400635, |
| "learning_rate": 6.513802277254363e-06, |
| "loss": 0.1427, |
| "num_input_tokens_seen": 1090576, |
| "step": 8445 |
| }, |
| { |
| "epoch": 7.889822595704949, |
| "grad_norm": 0.6578733325004578, |
| "learning_rate": 6.48639967227489e-06, |
| "loss": 0.0215, |
| "num_input_tokens_seen": 1091296, |
| "step": 8450 |
| }, |
| { |
| "epoch": 7.894491129785248, |
| "grad_norm": 1.3396340608596802, |
| "learning_rate": 6.459046233488372e-06, |
| "loss": 0.0259, |
| "num_input_tokens_seen": 1091984, |
| "step": 8455 |
| }, |
| { |
| "epoch": 7.899159663865547, |
| "grad_norm": 4.178629398345947, |
| "learning_rate": 6.431742033536797e-06, |
| "loss": 0.101, |
| "num_input_tokens_seen": 1092608, |
| "step": 8460 |
| }, |
| { |
| "epoch": 7.903828197945845, |
| "grad_norm": 0.4377411901950836, |
| "learning_rate": 6.404487144931379e-06, |
| "loss": 0.0471, |
| "num_input_tokens_seen": 1093296, |
| "step": 8465 |
| }, |
| { |
| "epoch": 7.908496732026144, |
| "grad_norm": 11.185189247131348, |
| "learning_rate": 6.377281640052357e-06, |
| "loss": 0.1267, |
| "num_input_tokens_seen": 1093952, |
| "step": 8470 |
| }, |
| { |
| "epoch": 7.913165266106443, |
| "grad_norm": 0.6686238646507263, |
| "learning_rate": 6.3501255911488565e-06, |
| "loss": 0.2006, |
| "num_input_tokens_seen": 1094576, |
| "step": 8475 |
| }, |
| { |
| "epoch": 7.917833800186742, |
| "grad_norm": 3.336912155151367, |
| "learning_rate": 6.323019070338629e-06, |
| "loss": 0.0915, |
| "num_input_tokens_seen": 1095200, |
| "step": 8480 |
| }, |
| { |
| "epoch": 7.92250233426704, |
| "grad_norm": 0.2326798141002655, |
| "learning_rate": 6.29596214960792e-06, |
| "loss": 0.082, |
| "num_input_tokens_seen": 1095856, |
| "step": 8485 |
| }, |
| { |
| "epoch": 7.927170868347339, |
| "grad_norm": 0.3160500228404999, |
| "learning_rate": 6.2689549008112466e-06, |
| "loss": 0.1561, |
| "num_input_tokens_seen": 1096592, |
| "step": 8490 |
| }, |
| { |
| "epoch": 7.931839402427638, |
| "grad_norm": 0.7252269387245178, |
| "learning_rate": 6.241997395671209e-06, |
| "loss": 0.022, |
| "num_input_tokens_seen": 1097200, |
| "step": 8495 |
| }, |
| { |
| "epoch": 7.936507936507937, |
| "grad_norm": 5.833823204040527, |
| "learning_rate": 6.215089705778315e-06, |
| "loss": 0.0991, |
| "num_input_tokens_seen": 1097840, |
| "step": 8500 |
| }, |
| { |
| "epoch": 7.9411764705882355, |
| "grad_norm": 0.3958582282066345, |
| "learning_rate": 6.18823190259076e-06, |
| "loss": 0.0325, |
| "num_input_tokens_seen": 1098432, |
| "step": 8505 |
| }, |
| { |
| "epoch": 7.945845004668534, |
| "grad_norm": 0.03990139067173004, |
| "learning_rate": 6.161424057434278e-06, |
| "loss": 0.0129, |
| "num_input_tokens_seen": 1099136, |
| "step": 8510 |
| }, |
| { |
| "epoch": 7.950513538748833, |
| "grad_norm": 0.18915289640426636, |
| "learning_rate": 6.134666241501905e-06, |
| "loss": 0.2817, |
| "num_input_tokens_seen": 1099776, |
| "step": 8515 |
| }, |
| { |
| "epoch": 7.955182072829132, |
| "grad_norm": 1.3937067985534668, |
| "learning_rate": 6.107958525853838e-06, |
| "loss": 0.1055, |
| "num_input_tokens_seen": 1100464, |
| "step": 8520 |
| }, |
| { |
| "epoch": 7.959850606909431, |
| "grad_norm": 3.5351006984710693, |
| "learning_rate": 6.081300981417226e-06, |
| "loss": 0.0682, |
| "num_input_tokens_seen": 1101104, |
| "step": 8525 |
| }, |
| { |
| "epoch": 7.964519140989729, |
| "grad_norm": 3.100557804107666, |
| "learning_rate": 6.0546936789859505e-06, |
| "loss": 0.0532, |
| "num_input_tokens_seen": 1101744, |
| "step": 8530 |
| }, |
| { |
| "epoch": 7.969187675070028, |
| "grad_norm": 1.3666666746139526, |
| "learning_rate": 6.028136689220498e-06, |
| "loss": 0.1588, |
| "num_input_tokens_seen": 1102400, |
| "step": 8535 |
| }, |
| { |
| "epoch": 7.973856209150327, |
| "grad_norm": 5.621835231781006, |
| "learning_rate": 6.001630082647722e-06, |
| "loss": 0.0461, |
| "num_input_tokens_seen": 1102992, |
| "step": 8540 |
| }, |
| { |
| "epoch": 7.978524743230626, |
| "grad_norm": 2.5733768939971924, |
| "learning_rate": 5.975173929660688e-06, |
| "loss": 0.1164, |
| "num_input_tokens_seen": 1103632, |
| "step": 8545 |
| }, |
| { |
| "epoch": 7.983193277310924, |
| "grad_norm": 3.054447650909424, |
| "learning_rate": 5.948768300518459e-06, |
| "loss": 0.1701, |
| "num_input_tokens_seen": 1104320, |
| "step": 8550 |
| }, |
| { |
| "epoch": 7.987861811391223, |
| "grad_norm": 4.3491058349609375, |
| "learning_rate": 5.922413265345922e-06, |
| "loss": 0.1025, |
| "num_input_tokens_seen": 1104928, |
| "step": 8555 |
| }, |
| { |
| "epoch": 7.992530345471522, |
| "grad_norm": 11.961054801940918, |
| "learning_rate": 5.896108894133617e-06, |
| "loss": 0.1614, |
| "num_input_tokens_seen": 1105440, |
| "step": 8560 |
| }, |
| { |
| "epoch": 7.997198879551821, |
| "grad_norm": 11.302621841430664, |
| "learning_rate": 5.8698552567375275e-06, |
| "loss": 0.1423, |
| "num_input_tokens_seen": 1106112, |
| "step": 8565 |
| }, |
| { |
| "epoch": 8.00186741363212, |
| "grad_norm": 0.8292189240455627, |
| "learning_rate": 5.8436524228789145e-06, |
| "loss": 0.0762, |
| "num_input_tokens_seen": 1106664, |
| "step": 8570 |
| }, |
| { |
| "epoch": 8.006535947712418, |
| "grad_norm": 1.8794206380844116, |
| "learning_rate": 5.817500462144099e-06, |
| "loss": 0.0452, |
| "num_input_tokens_seen": 1107368, |
| "step": 8575 |
| }, |
| { |
| "epoch": 8.007469654528478, |
| "eval_loss": 1.052361011505127, |
| "eval_runtime": 3.8664, |
| "eval_samples_per_second": 61.556, |
| "eval_steps_per_second": 30.778, |
| "num_input_tokens_seen": 1107480, |
| "step": 8576 |
| }, |
| { |
| "epoch": 8.011204481792717, |
| "grad_norm": 1.9271084070205688, |
| "learning_rate": 5.791399443984319e-06, |
| "loss": 0.2034, |
| "num_input_tokens_seen": 1107992, |
| "step": 8580 |
| }, |
| { |
| "epoch": 8.015873015873016, |
| "grad_norm": 3.432539939880371, |
| "learning_rate": 5.76534943771552e-06, |
| "loss": 0.0288, |
| "num_input_tokens_seen": 1108664, |
| "step": 8585 |
| }, |
| { |
| "epoch": 8.020541549953315, |
| "grad_norm": 0.08454066514968872, |
| "learning_rate": 5.73935051251818e-06, |
| "loss": 0.0748, |
| "num_input_tokens_seen": 1109224, |
| "step": 8590 |
| }, |
| { |
| "epoch": 8.025210084033613, |
| "grad_norm": 1.4576191902160645, |
| "learning_rate": 5.71340273743711e-06, |
| "loss": 0.0179, |
| "num_input_tokens_seen": 1109976, |
| "step": 8595 |
| }, |
| { |
| "epoch": 8.029878618113912, |
| "grad_norm": 6.439827919006348, |
| "learning_rate": 5.687506181381286e-06, |
| "loss": 0.0867, |
| "num_input_tokens_seen": 1110648, |
| "step": 8600 |
| }, |
| { |
| "epoch": 8.034547152194211, |
| "grad_norm": 1.4698748588562012, |
| "learning_rate": 5.661660913123673e-06, |
| "loss": 0.053, |
| "num_input_tokens_seen": 1111320, |
| "step": 8605 |
| }, |
| { |
| "epoch": 8.03921568627451, |
| "grad_norm": 11.151979446411133, |
| "learning_rate": 5.635867001301026e-06, |
| "loss": 0.081, |
| "num_input_tokens_seen": 1111976, |
| "step": 8610 |
| }, |
| { |
| "epoch": 8.043884220354808, |
| "grad_norm": 0.18296192586421967, |
| "learning_rate": 5.610124514413714e-06, |
| "loss": 0.0617, |
| "num_input_tokens_seen": 1112712, |
| "step": 8615 |
| }, |
| { |
| "epoch": 8.048552754435107, |
| "grad_norm": 0.12693344056606293, |
| "learning_rate": 5.584433520825541e-06, |
| "loss": 0.0376, |
| "num_input_tokens_seen": 1113336, |
| "step": 8620 |
| }, |
| { |
| "epoch": 8.053221288515406, |
| "grad_norm": 1.491117238998413, |
| "learning_rate": 5.55879408876355e-06, |
| "loss": 0.03, |
| "num_input_tokens_seen": 1113960, |
| "step": 8625 |
| }, |
| { |
| "epoch": 8.057889822595705, |
| "grad_norm": 3.4405813217163086, |
| "learning_rate": 5.5332062863178685e-06, |
| "loss": 0.0393, |
| "num_input_tokens_seen": 1114600, |
| "step": 8630 |
| }, |
| { |
| "epoch": 8.062558356676004, |
| "grad_norm": 5.879638671875, |
| "learning_rate": 5.507670181441493e-06, |
| "loss": 0.1544, |
| "num_input_tokens_seen": 1115240, |
| "step": 8635 |
| }, |
| { |
| "epoch": 8.067226890756302, |
| "grad_norm": 8.053985595703125, |
| "learning_rate": 5.482185841950147e-06, |
| "loss": 0.0551, |
| "num_input_tokens_seen": 1115800, |
| "step": 8640 |
| }, |
| { |
| "epoch": 8.071895424836601, |
| "grad_norm": 5.818375587463379, |
| "learning_rate": 5.4567533355220804e-06, |
| "loss": 0.017, |
| "num_input_tokens_seen": 1116392, |
| "step": 8645 |
| }, |
| { |
| "epoch": 8.0765639589169, |
| "grad_norm": 0.13191650807857513, |
| "learning_rate": 5.43137272969787e-06, |
| "loss": 0.0075, |
| "num_input_tokens_seen": 1116984, |
| "step": 8650 |
| }, |
| { |
| "epoch": 8.081232492997199, |
| "grad_norm": 7.324196815490723, |
| "learning_rate": 5.406044091880285e-06, |
| "loss": 0.0568, |
| "num_input_tokens_seen": 1117544, |
| "step": 8655 |
| }, |
| { |
| "epoch": 8.085901027077497, |
| "grad_norm": 4.285675525665283, |
| "learning_rate": 5.380767489334076e-06, |
| "loss": 0.0771, |
| "num_input_tokens_seen": 1118248, |
| "step": 8660 |
| }, |
| { |
| "epoch": 8.090569561157796, |
| "grad_norm": 1.300607681274414, |
| "learning_rate": 5.3555429891858075e-06, |
| "loss": 0.0213, |
| "num_input_tokens_seen": 1118888, |
| "step": 8665 |
| }, |
| { |
| "epoch": 8.095238095238095, |
| "grad_norm": 1.639029860496521, |
| "learning_rate": 5.330370658423661e-06, |
| "loss": 0.0412, |
| "num_input_tokens_seen": 1119560, |
| "step": 8670 |
| }, |
| { |
| "epoch": 8.099906629318394, |
| "grad_norm": 0.05550481006503105, |
| "learning_rate": 5.305250563897299e-06, |
| "loss": 0.0482, |
| "num_input_tokens_seen": 1120184, |
| "step": 8675 |
| }, |
| { |
| "epoch": 8.104575163398692, |
| "grad_norm": 0.6807479858398438, |
| "learning_rate": 5.280182772317632e-06, |
| "loss": 0.0375, |
| "num_input_tokens_seen": 1120888, |
| "step": 8680 |
| }, |
| { |
| "epoch": 8.109243697478991, |
| "grad_norm": 1.5354772806167603, |
| "learning_rate": 5.255167350256693e-06, |
| "loss": 0.0708, |
| "num_input_tokens_seen": 1121496, |
| "step": 8685 |
| }, |
| { |
| "epoch": 8.11391223155929, |
| "grad_norm": 4.521956443786621, |
| "learning_rate": 5.230204364147432e-06, |
| "loss": 0.0741, |
| "num_input_tokens_seen": 1122120, |
| "step": 8690 |
| }, |
| { |
| "epoch": 8.118580765639589, |
| "grad_norm": 3.2341227531433105, |
| "learning_rate": 5.205293880283552e-06, |
| "loss": 0.0191, |
| "num_input_tokens_seen": 1122792, |
| "step": 8695 |
| }, |
| { |
| "epoch": 8.123249299719888, |
| "grad_norm": 1.803482174873352, |
| "learning_rate": 5.180435964819303e-06, |
| "loss": 0.046, |
| "num_input_tokens_seen": 1123336, |
| "step": 8700 |
| }, |
| { |
| "epoch": 8.127917833800186, |
| "grad_norm": 1.5913814306259155, |
| "learning_rate": 5.155630683769358e-06, |
| "loss": 0.0512, |
| "num_input_tokens_seen": 1123960, |
| "step": 8705 |
| }, |
| { |
| "epoch": 8.132586367880485, |
| "grad_norm": 0.06489665806293488, |
| "learning_rate": 5.130878103008604e-06, |
| "loss": 0.062, |
| "num_input_tokens_seen": 1124648, |
| "step": 8710 |
| }, |
| { |
| "epoch": 8.137254901960784, |
| "grad_norm": 0.5183777809143066, |
| "learning_rate": 5.106178288271962e-06, |
| "loss": 0.0055, |
| "num_input_tokens_seen": 1125352, |
| "step": 8715 |
| }, |
| { |
| "epoch": 8.141923436041083, |
| "grad_norm": 0.4233246147632599, |
| "learning_rate": 5.081531305154219e-06, |
| "loss": 0.0208, |
| "num_input_tokens_seen": 1125976, |
| "step": 8720 |
| }, |
| { |
| "epoch": 8.146591970121381, |
| "grad_norm": 1.313720464706421, |
| "learning_rate": 5.056937219109881e-06, |
| "loss": 0.0077, |
| "num_input_tokens_seen": 1126600, |
| "step": 8725 |
| }, |
| { |
| "epoch": 8.15126050420168, |
| "grad_norm": 2.7932381629943848, |
| "learning_rate": 5.032396095452957e-06, |
| "loss": 0.0248, |
| "num_input_tokens_seen": 1127240, |
| "step": 8730 |
| }, |
| { |
| "epoch": 8.155929038281979, |
| "grad_norm": 2.909801959991455, |
| "learning_rate": 5.007907999356814e-06, |
| "loss": 0.0111, |
| "num_input_tokens_seen": 1127944, |
| "step": 8735 |
| }, |
| { |
| "epoch": 8.160597572362278, |
| "grad_norm": 1.8586024045944214, |
| "learning_rate": 4.9834729958540025e-06, |
| "loss": 0.0489, |
| "num_input_tokens_seen": 1128632, |
| "step": 8740 |
| }, |
| { |
| "epoch": 8.165266106442576, |
| "grad_norm": 2.5531888008117676, |
| "learning_rate": 4.959091149836048e-06, |
| "loss": 0.0074, |
| "num_input_tokens_seen": 1129256, |
| "step": 8745 |
| }, |
| { |
| "epoch": 8.169934640522875, |
| "grad_norm": 0.19135218858718872, |
| "learning_rate": 4.934762526053333e-06, |
| "loss": 0.0356, |
| "num_input_tokens_seen": 1129896, |
| "step": 8750 |
| }, |
| { |
| "epoch": 8.174603174603174, |
| "grad_norm": 0.020777558907866478, |
| "learning_rate": 4.910487189114893e-06, |
| "loss": 0.0827, |
| "num_input_tokens_seen": 1130568, |
| "step": 8755 |
| }, |
| { |
| "epoch": 8.179271708683473, |
| "grad_norm": 4.427436828613281, |
| "learning_rate": 4.886265203488241e-06, |
| "loss": 0.089, |
| "num_input_tokens_seen": 1131208, |
| "step": 8760 |
| }, |
| { |
| "epoch": 8.183940242763772, |
| "grad_norm": 0.16428785026073456, |
| "learning_rate": 4.862096633499225e-06, |
| "loss": 0.0159, |
| "num_input_tokens_seen": 1131848, |
| "step": 8765 |
| }, |
| { |
| "epoch": 8.18860877684407, |
| "grad_norm": 3.0339784622192383, |
| "learning_rate": 4.83798154333181e-06, |
| "loss": 0.0792, |
| "num_input_tokens_seen": 1132536, |
| "step": 8770 |
| }, |
| { |
| "epoch": 8.193277310924369, |
| "grad_norm": 0.03558497503399849, |
| "learning_rate": 4.81391999702796e-06, |
| "loss": 0.0184, |
| "num_input_tokens_seen": 1133176, |
| "step": 8775 |
| }, |
| { |
| "epoch": 8.197945845004668, |
| "grad_norm": 0.018901566043496132, |
| "learning_rate": 4.789912058487436e-06, |
| "loss": 0.0375, |
| "num_input_tokens_seen": 1133832, |
| "step": 8780 |
| }, |
| { |
| "epoch": 8.202614379084967, |
| "grad_norm": 1.9881067276000977, |
| "learning_rate": 4.765957791467635e-06, |
| "loss": 0.2102, |
| "num_input_tokens_seen": 1134408, |
| "step": 8785 |
| }, |
| { |
| "epoch": 8.207282913165265, |
| "grad_norm": 4.089415550231934, |
| "learning_rate": 4.7420572595834185e-06, |
| "loss": 0.0369, |
| "num_input_tokens_seen": 1135128, |
| "step": 8790 |
| }, |
| { |
| "epoch": 8.211951447245564, |
| "grad_norm": 9.398062705993652, |
| "learning_rate": 4.7182105263069455e-06, |
| "loss": 0.0878, |
| "num_input_tokens_seen": 1135752, |
| "step": 8795 |
| }, |
| { |
| "epoch": 8.216619981325863, |
| "grad_norm": 3.531256914138794, |
| "learning_rate": 4.694417654967492e-06, |
| "loss": 0.0092, |
| "num_input_tokens_seen": 1136440, |
| "step": 8800 |
| }, |
| { |
| "epoch": 8.221288515406162, |
| "grad_norm": 7.09257698059082, |
| "learning_rate": 4.670678708751311e-06, |
| "loss": 0.129, |
| "num_input_tokens_seen": 1137000, |
| "step": 8805 |
| }, |
| { |
| "epoch": 8.22595704948646, |
| "grad_norm": 0.060900285840034485, |
| "learning_rate": 4.646993750701439e-06, |
| "loss": 0.104, |
| "num_input_tokens_seen": 1137608, |
| "step": 8810 |
| }, |
| { |
| "epoch": 8.23062558356676, |
| "grad_norm": 0.01634305715560913, |
| "learning_rate": 4.623362843717549e-06, |
| "loss": 0.008, |
| "num_input_tokens_seen": 1138184, |
| "step": 8815 |
| }, |
| { |
| "epoch": 8.235294117647058, |
| "grad_norm": 14.870940208435059, |
| "learning_rate": 4.599786050555746e-06, |
| "loss": 0.0601, |
| "num_input_tokens_seen": 1138776, |
| "step": 8820 |
| }, |
| { |
| "epoch": 8.239962651727357, |
| "grad_norm": 0.4319959282875061, |
| "learning_rate": 4.576263433828445e-06, |
| "loss": 0.0125, |
| "num_input_tokens_seen": 1139448, |
| "step": 8825 |
| }, |
| { |
| "epoch": 8.244631185807656, |
| "grad_norm": 4.329498767852783, |
| "learning_rate": 4.552795056004194e-06, |
| "loss": 0.0407, |
| "num_input_tokens_seen": 1140056, |
| "step": 8830 |
| }, |
| { |
| "epoch": 8.249299719887954, |
| "grad_norm": 4.342251777648926, |
| "learning_rate": 4.5293809794074744e-06, |
| "loss": 0.0548, |
| "num_input_tokens_seen": 1140600, |
| "step": 8835 |
| }, |
| { |
| "epoch": 8.253968253968253, |
| "grad_norm": 6.068799018859863, |
| "learning_rate": 4.506021266218582e-06, |
| "loss": 0.0323, |
| "num_input_tokens_seen": 1141224, |
| "step": 8840 |
| }, |
| { |
| "epoch": 8.258636788048554, |
| "grad_norm": 6.533751487731934, |
| "learning_rate": 4.482715978473428e-06, |
| "loss": 0.0266, |
| "num_input_tokens_seen": 1141896, |
| "step": 8845 |
| }, |
| { |
| "epoch": 8.263305322128852, |
| "grad_norm": 8.173686027526855, |
| "learning_rate": 4.459465178063396e-06, |
| "loss": 0.0754, |
| "num_input_tokens_seen": 1142536, |
| "step": 8850 |
| }, |
| { |
| "epoch": 8.267973856209151, |
| "grad_norm": 0.6834007501602173, |
| "learning_rate": 4.436268926735162e-06, |
| "loss": 0.0458, |
| "num_input_tokens_seen": 1143112, |
| "step": 8855 |
| }, |
| { |
| "epoch": 8.27264239028945, |
| "grad_norm": 1.0048269033432007, |
| "learning_rate": 4.4131272860905455e-06, |
| "loss": 0.0213, |
| "num_input_tokens_seen": 1143736, |
| "step": 8860 |
| }, |
| { |
| "epoch": 8.277310924369749, |
| "grad_norm": 1.5272879600524902, |
| "learning_rate": 4.390040317586336e-06, |
| "loss": 0.0805, |
| "num_input_tokens_seen": 1144456, |
| "step": 8865 |
| }, |
| { |
| "epoch": 8.281979458450047, |
| "grad_norm": 0.3423355221748352, |
| "learning_rate": 4.367008082534113e-06, |
| "loss": 0.03, |
| "num_input_tokens_seen": 1145208, |
| "step": 8870 |
| }, |
| { |
| "epoch": 8.286647992530346, |
| "grad_norm": 6.090906620025635, |
| "learning_rate": 4.344030642100133e-06, |
| "loss": 0.0183, |
| "num_input_tokens_seen": 1145864, |
| "step": 8875 |
| }, |
| { |
| "epoch": 8.291316526610645, |
| "grad_norm": 3.7591450214385986, |
| "learning_rate": 4.321108057305101e-06, |
| "loss": 0.1285, |
| "num_input_tokens_seen": 1146504, |
| "step": 8880 |
| }, |
| { |
| "epoch": 8.295985060690944, |
| "grad_norm": 5.501720905303955, |
| "learning_rate": 4.298240389024077e-06, |
| "loss": 0.0518, |
| "num_input_tokens_seen": 1147016, |
| "step": 8885 |
| }, |
| { |
| "epoch": 8.300653594771243, |
| "grad_norm": 7.5667266845703125, |
| "learning_rate": 4.2754276979862536e-06, |
| "loss": 0.0339, |
| "num_input_tokens_seen": 1147624, |
| "step": 8890 |
| }, |
| { |
| "epoch": 8.305322128851541, |
| "grad_norm": 0.4266260862350464, |
| "learning_rate": 4.252670044774831e-06, |
| "loss": 0.0054, |
| "num_input_tokens_seen": 1148168, |
| "step": 8895 |
| }, |
| { |
| "epoch": 8.30999066293184, |
| "grad_norm": 0.5307876467704773, |
| "learning_rate": 4.229967489826853e-06, |
| "loss": 0.0462, |
| "num_input_tokens_seen": 1148856, |
| "step": 8900 |
| }, |
| { |
| "epoch": 8.314659197012139, |
| "grad_norm": 6.373628616333008, |
| "learning_rate": 4.2073200934330315e-06, |
| "loss": 0.1144, |
| "num_input_tokens_seen": 1149496, |
| "step": 8905 |
| }, |
| { |
| "epoch": 8.319327731092438, |
| "grad_norm": 0.1435551941394806, |
| "learning_rate": 4.184727915737607e-06, |
| "loss": 0.0092, |
| "num_input_tokens_seen": 1150328, |
| "step": 8910 |
| }, |
| { |
| "epoch": 8.323996265172736, |
| "grad_norm": 0.4174332916736603, |
| "learning_rate": 4.162191016738151e-06, |
| "loss": 0.0188, |
| "num_input_tokens_seen": 1150920, |
| "step": 8915 |
| }, |
| { |
| "epoch": 8.328664799253035, |
| "grad_norm": 0.07925593107938766, |
| "learning_rate": 4.139709456285465e-06, |
| "loss": 0.0047, |
| "num_input_tokens_seen": 1151512, |
| "step": 8920 |
| }, |
| { |
| "epoch": 8.333333333333334, |
| "grad_norm": 4.719840049743652, |
| "learning_rate": 4.11728329408336e-06, |
| "loss": 0.0052, |
| "num_input_tokens_seen": 1152184, |
| "step": 8925 |
| }, |
| { |
| "epoch": 8.338001867413633, |
| "grad_norm": 0.5039065480232239, |
| "learning_rate": 4.094912589688546e-06, |
| "loss": 0.0378, |
| "num_input_tokens_seen": 1152872, |
| "step": 8930 |
| }, |
| { |
| "epoch": 8.342670401493931, |
| "grad_norm": 0.1591583490371704, |
| "learning_rate": 4.072597402510455e-06, |
| "loss": 0.0094, |
| "num_input_tokens_seen": 1153592, |
| "step": 8935 |
| }, |
| { |
| "epoch": 8.34733893557423, |
| "grad_norm": 1.2247728109359741, |
| "learning_rate": 4.050337791811068e-06, |
| "loss": 0.1042, |
| "num_input_tokens_seen": 1154168, |
| "step": 8940 |
| }, |
| { |
| "epoch": 8.352007469654529, |
| "grad_norm": 0.2649737596511841, |
| "learning_rate": 4.0281338167047825e-06, |
| "loss": 0.0668, |
| "num_input_tokens_seen": 1154872, |
| "step": 8945 |
| }, |
| { |
| "epoch": 8.356676003734828, |
| "grad_norm": 0.05071331188082695, |
| "learning_rate": 4.005985536158246e-06, |
| "loss": 0.0744, |
| "num_input_tokens_seen": 1155576, |
| "step": 8950 |
| }, |
| { |
| "epoch": 8.361344537815127, |
| "grad_norm": 3.964921712875366, |
| "learning_rate": 3.983893008990208e-06, |
| "loss": 0.0497, |
| "num_input_tokens_seen": 1156184, |
| "step": 8955 |
| }, |
| { |
| "epoch": 8.366013071895425, |
| "grad_norm": 0.3039446771144867, |
| "learning_rate": 3.961856293871336e-06, |
| "loss": 0.0033, |
| "num_input_tokens_seen": 1156792, |
| "step": 8960 |
| }, |
| { |
| "epoch": 8.370681605975724, |
| "grad_norm": 0.20161408185958862, |
| "learning_rate": 3.939875449324082e-06, |
| "loss": 0.1079, |
| "num_input_tokens_seen": 1157320, |
| "step": 8965 |
| }, |
| { |
| "epoch": 8.375350140056023, |
| "grad_norm": 0.36548763513565063, |
| "learning_rate": 3.917950533722534e-06, |
| "loss": 0.1091, |
| "num_input_tokens_seen": 1158024, |
| "step": 8970 |
| }, |
| { |
| "epoch": 8.380018674136322, |
| "grad_norm": 0.5267390608787537, |
| "learning_rate": 3.896081605292246e-06, |
| "loss": 0.0544, |
| "num_input_tokens_seen": 1158600, |
| "step": 8975 |
| }, |
| { |
| "epoch": 8.38468720821662, |
| "grad_norm": 0.9543160796165466, |
| "learning_rate": 3.874268722110089e-06, |
| "loss": 0.3445, |
| "num_input_tokens_seen": 1159192, |
| "step": 8980 |
| }, |
| { |
| "epoch": 8.38935574229692, |
| "grad_norm": 2.8803083896636963, |
| "learning_rate": 3.852511942104101e-06, |
| "loss": 0.0229, |
| "num_input_tokens_seen": 1159800, |
| "step": 8985 |
| }, |
| { |
| "epoch": 8.394024276377218, |
| "grad_norm": 5.321974277496338, |
| "learning_rate": 3.83081132305331e-06, |
| "loss": 0.1096, |
| "num_input_tokens_seen": 1160376, |
| "step": 8990 |
| }, |
| { |
| "epoch": 8.398692810457517, |
| "grad_norm": 0.6705957055091858, |
| "learning_rate": 3.8091669225876176e-06, |
| "loss": 0.0038, |
| "num_input_tokens_seen": 1161016, |
| "step": 8995 |
| }, |
| { |
| "epoch": 8.403361344537815, |
| "grad_norm": 3.0900211334228516, |
| "learning_rate": 3.7875787981876105e-06, |
| "loss": 0.0255, |
| "num_input_tokens_seen": 1161752, |
| "step": 9000 |
| }, |
| { |
| "epoch": 8.408029878618114, |
| "grad_norm": 1.3996245861053467, |
| "learning_rate": 3.7660470071844393e-06, |
| "loss": 0.1057, |
| "num_input_tokens_seen": 1162328, |
| "step": 9005 |
| }, |
| { |
| "epoch": 8.412698412698413, |
| "grad_norm": 0.35684734582901, |
| "learning_rate": 3.7445716067596503e-06, |
| "loss": 0.0695, |
| "num_input_tokens_seen": 1163000, |
| "step": 9010 |
| }, |
| { |
| "epoch": 8.417366946778712, |
| "grad_norm": 7.436266899108887, |
| "learning_rate": 3.7231526539450167e-06, |
| "loss": 0.054, |
| "num_input_tokens_seen": 1163672, |
| "step": 9015 |
| }, |
| { |
| "epoch": 8.42203548085901, |
| "grad_norm": 0.4128905236721039, |
| "learning_rate": 3.701790205622421e-06, |
| "loss": 0.0206, |
| "num_input_tokens_seen": 1164344, |
| "step": 9020 |
| }, |
| { |
| "epoch": 8.42670401493931, |
| "grad_norm": 6.121613025665283, |
| "learning_rate": 3.6804843185236885e-06, |
| "loss": 0.0515, |
| "num_input_tokens_seen": 1164936, |
| "step": 9025 |
| }, |
| { |
| "epoch": 8.431372549019608, |
| "grad_norm": 0.07441362738609314, |
| "learning_rate": 3.6592350492304277e-06, |
| "loss": 0.0027, |
| "num_input_tokens_seen": 1165496, |
| "step": 9030 |
| }, |
| { |
| "epoch": 8.436041083099907, |
| "grad_norm": 0.13714176416397095, |
| "learning_rate": 3.638042454173901e-06, |
| "loss": 0.0231, |
| "num_input_tokens_seen": 1166088, |
| "step": 9035 |
| }, |
| { |
| "epoch": 8.440709617180206, |
| "grad_norm": 3.3369691371917725, |
| "learning_rate": 3.616906589634844e-06, |
| "loss": 0.0825, |
| "num_input_tokens_seen": 1166680, |
| "step": 9040 |
| }, |
| { |
| "epoch": 8.445378151260504, |
| "grad_norm": 0.21978700160980225, |
| "learning_rate": 3.595827511743341e-06, |
| "loss": 0.1878, |
| "num_input_tokens_seen": 1167320, |
| "step": 9045 |
| }, |
| { |
| "epoch": 8.450046685340803, |
| "grad_norm": 1.6111265420913696, |
| "learning_rate": 3.5748052764786737e-06, |
| "loss": 0.0131, |
| "num_input_tokens_seen": 1167992, |
| "step": 9050 |
| }, |
| { |
| "epoch": 8.454715219421102, |
| "grad_norm": 6.312107563018799, |
| "learning_rate": 3.5538399396691707e-06, |
| "loss": 0.0709, |
| "num_input_tokens_seen": 1168600, |
| "step": 9055 |
| }, |
| { |
| "epoch": 8.4593837535014, |
| "grad_norm": 0.629169225692749, |
| "learning_rate": 3.5329315569920558e-06, |
| "loss": 0.0143, |
| "num_input_tokens_seen": 1169192, |
| "step": 9060 |
| }, |
| { |
| "epoch": 8.4640522875817, |
| "grad_norm": 0.046670157462358475, |
| "learning_rate": 3.512080183973285e-06, |
| "loss": 0.0017, |
| "num_input_tokens_seen": 1169992, |
| "step": 9065 |
| }, |
| { |
| "epoch": 8.468720821661998, |
| "grad_norm": 0.28610295057296753, |
| "learning_rate": 3.4912858759874295e-06, |
| "loss": 0.0572, |
| "num_input_tokens_seen": 1170648, |
| "step": 9070 |
| }, |
| { |
| "epoch": 8.473389355742297, |
| "grad_norm": 0.05349961295723915, |
| "learning_rate": 3.470548688257522e-06, |
| "loss": 0.046, |
| "num_input_tokens_seen": 1171400, |
| "step": 9075 |
| }, |
| { |
| "epoch": 8.478057889822596, |
| "grad_norm": 12.51286506652832, |
| "learning_rate": 3.4498686758548784e-06, |
| "loss": 0.0764, |
| "num_input_tokens_seen": 1172120, |
| "step": 9080 |
| }, |
| { |
| "epoch": 8.482726423902895, |
| "grad_norm": 5.514620304107666, |
| "learning_rate": 3.4292458936989983e-06, |
| "loss": 0.0543, |
| "num_input_tokens_seen": 1172648, |
| "step": 9085 |
| }, |
| { |
| "epoch": 8.487394957983193, |
| "grad_norm": 1.6812564134597778, |
| "learning_rate": 3.408680396557376e-06, |
| "loss": 0.1259, |
| "num_input_tokens_seen": 1173320, |
| "step": 9090 |
| }, |
| { |
| "epoch": 8.492063492063492, |
| "grad_norm": 0.4996938407421112, |
| "learning_rate": 3.3881722390453923e-06, |
| "loss": 0.0785, |
| "num_input_tokens_seen": 1173928, |
| "step": 9095 |
| }, |
| { |
| "epoch": 8.49673202614379, |
| "grad_norm": 0.6753255724906921, |
| "learning_rate": 3.36772147562614e-06, |
| "loss": 0.07, |
| "num_input_tokens_seen": 1174632, |
| "step": 9100 |
| }, |
| { |
| "epoch": 8.50140056022409, |
| "grad_norm": 7.039217948913574, |
| "learning_rate": 3.3473281606103078e-06, |
| "loss": 0.0705, |
| "num_input_tokens_seen": 1175400, |
| "step": 9105 |
| }, |
| { |
| "epoch": 8.506069094304388, |
| "grad_norm": 6.246345520019531, |
| "learning_rate": 3.3269923481559966e-06, |
| "loss": 0.1948, |
| "num_input_tokens_seen": 1175960, |
| "step": 9110 |
| }, |
| { |
| "epoch": 8.507936507936508, |
| "eval_loss": 1.1914538145065308, |
| "eval_runtime": 3.8658, |
| "eval_samples_per_second": 61.565, |
| "eval_steps_per_second": 30.783, |
| "num_input_tokens_seen": 1176200, |
| "step": 9112 |
| }, |
| { |
| "epoch": 8.510737628384687, |
| "grad_norm": 2.0727744102478027, |
| "learning_rate": 3.3067140922686174e-06, |
| "loss": 0.0087, |
| "num_input_tokens_seen": 1176584, |
| "step": 9115 |
| }, |
| { |
| "epoch": 8.515406162464986, |
| "grad_norm": 4.565818786621094, |
| "learning_rate": 3.286493446800723e-06, |
| "loss": 0.0851, |
| "num_input_tokens_seen": 1177272, |
| "step": 9120 |
| }, |
| { |
| "epoch": 8.520074696545285, |
| "grad_norm": 18.623031616210938, |
| "learning_rate": 3.2663304654518695e-06, |
| "loss": 0.2342, |
| "num_input_tokens_seen": 1177864, |
| "step": 9125 |
| }, |
| { |
| "epoch": 8.524743230625583, |
| "grad_norm": 0.040356144309043884, |
| "learning_rate": 3.2462252017684797e-06, |
| "loss": 0.0861, |
| "num_input_tokens_seen": 1178504, |
| "step": 9130 |
| }, |
| { |
| "epoch": 8.529411764705882, |
| "grad_norm": 6.924943923950195, |
| "learning_rate": 3.2261777091436907e-06, |
| "loss": 0.0166, |
| "num_input_tokens_seen": 1179224, |
| "step": 9135 |
| }, |
| { |
| "epoch": 8.534080298786181, |
| "grad_norm": 9.759965896606445, |
| "learning_rate": 3.2061880408172235e-06, |
| "loss": 0.0224, |
| "num_input_tokens_seen": 1179816, |
| "step": 9140 |
| }, |
| { |
| "epoch": 8.53874883286648, |
| "grad_norm": 4.266696453094482, |
| "learning_rate": 3.1862562498752356e-06, |
| "loss": 0.1285, |
| "num_input_tokens_seen": 1180552, |
| "step": 9145 |
| }, |
| { |
| "epoch": 8.543417366946779, |
| "grad_norm": 3.827735424041748, |
| "learning_rate": 3.1663823892501803e-06, |
| "loss": 0.0228, |
| "num_input_tokens_seen": 1181160, |
| "step": 9150 |
| }, |
| { |
| "epoch": 8.548085901027077, |
| "grad_norm": 17.82668113708496, |
| "learning_rate": 3.146566511720675e-06, |
| "loss": 0.1993, |
| "num_input_tokens_seen": 1181784, |
| "step": 9155 |
| }, |
| { |
| "epoch": 8.552754435107376, |
| "grad_norm": 0.9619858264923096, |
| "learning_rate": 3.1268086699113324e-06, |
| "loss": 0.0923, |
| "num_input_tokens_seen": 1182392, |
| "step": 9160 |
| }, |
| { |
| "epoch": 8.557422969187675, |
| "grad_norm": 0.9562932848930359, |
| "learning_rate": 3.1071089162926503e-06, |
| "loss": 0.0069, |
| "num_input_tokens_seen": 1183080, |
| "step": 9165 |
| }, |
| { |
| "epoch": 8.562091503267974, |
| "grad_norm": 7.583783149719238, |
| "learning_rate": 3.0874673031808713e-06, |
| "loss": 0.0645, |
| "num_input_tokens_seen": 1183752, |
| "step": 9170 |
| }, |
| { |
| "epoch": 8.566760037348272, |
| "grad_norm": 5.784301280975342, |
| "learning_rate": 3.0678838827378263e-06, |
| "loss": 0.0224, |
| "num_input_tokens_seen": 1184344, |
| "step": 9175 |
| }, |
| { |
| "epoch": 8.571428571428571, |
| "grad_norm": 0.48377174139022827, |
| "learning_rate": 3.0483587069708165e-06, |
| "loss": 0.0085, |
| "num_input_tokens_seen": 1185000, |
| "step": 9180 |
| }, |
| { |
| "epoch": 8.57609710550887, |
| "grad_norm": 3.4420366287231445, |
| "learning_rate": 3.0288918277324413e-06, |
| "loss": 0.0735, |
| "num_input_tokens_seen": 1185624, |
| "step": 9185 |
| }, |
| { |
| "epoch": 8.580765639589169, |
| "grad_norm": 0.18768535554409027, |
| "learning_rate": 3.009483296720503e-06, |
| "loss": 0.0257, |
| "num_input_tokens_seen": 1186280, |
| "step": 9190 |
| }, |
| { |
| "epoch": 8.585434173669467, |
| "grad_norm": 0.013358098454773426, |
| "learning_rate": 2.990133165477846e-06, |
| "loss": 0.1094, |
| "num_input_tokens_seen": 1186904, |
| "step": 9195 |
| }, |
| { |
| "epoch": 8.590102707749766, |
| "grad_norm": 3.7494723796844482, |
| "learning_rate": 2.970841485392223e-06, |
| "loss": 0.0293, |
| "num_input_tokens_seen": 1187544, |
| "step": 9200 |
| }, |
| { |
| "epoch": 8.594771241830065, |
| "grad_norm": 13.038296699523926, |
| "learning_rate": 2.9516083076961577e-06, |
| "loss": 0.2713, |
| "num_input_tokens_seen": 1188216, |
| "step": 9205 |
| }, |
| { |
| "epoch": 8.599439775910364, |
| "grad_norm": 0.1423317939043045, |
| "learning_rate": 2.932433683466801e-06, |
| "loss": 0.0192, |
| "num_input_tokens_seen": 1188840, |
| "step": 9210 |
| }, |
| { |
| "epoch": 8.604108309990663, |
| "grad_norm": 5.753176689147949, |
| "learning_rate": 2.9133176636258196e-06, |
| "loss": 0.0274, |
| "num_input_tokens_seen": 1189512, |
| "step": 9215 |
| }, |
| { |
| "epoch": 8.608776844070961, |
| "grad_norm": 0.27099505066871643, |
| "learning_rate": 2.8942602989392386e-06, |
| "loss": 0.1617, |
| "num_input_tokens_seen": 1190072, |
| "step": 9220 |
| }, |
| { |
| "epoch": 8.61344537815126, |
| "grad_norm": 0.08621136844158173, |
| "learning_rate": 2.8752616400173184e-06, |
| "loss": 0.0243, |
| "num_input_tokens_seen": 1190760, |
| "step": 9225 |
| }, |
| { |
| "epoch": 8.618113912231559, |
| "grad_norm": 11.92264461517334, |
| "learning_rate": 2.856321737314413e-06, |
| "loss": 0.0347, |
| "num_input_tokens_seen": 1191352, |
| "step": 9230 |
| }, |
| { |
| "epoch": 8.622782446311858, |
| "grad_norm": 2.3230783939361572, |
| "learning_rate": 2.83744064112883e-06, |
| "loss": 0.0859, |
| "num_input_tokens_seen": 1191928, |
| "step": 9235 |
| }, |
| { |
| "epoch": 8.627450980392156, |
| "grad_norm": 9.768386840820312, |
| "learning_rate": 2.8186184016027268e-06, |
| "loss": 0.1392, |
| "num_input_tokens_seen": 1192552, |
| "step": 9240 |
| }, |
| { |
| "epoch": 8.632119514472455, |
| "grad_norm": 14.89448356628418, |
| "learning_rate": 2.7998550687219267e-06, |
| "loss": 0.0993, |
| "num_input_tokens_seen": 1193272, |
| "step": 9245 |
| }, |
| { |
| "epoch": 8.636788048552754, |
| "grad_norm": 0.07315748929977417, |
| "learning_rate": 2.781150692315848e-06, |
| "loss": 0.0793, |
| "num_input_tokens_seen": 1193896, |
| "step": 9250 |
| }, |
| { |
| "epoch": 8.641456582633053, |
| "grad_norm": 5.044677257537842, |
| "learning_rate": 2.76250532205731e-06, |
| "loss": 0.045, |
| "num_input_tokens_seen": 1194616, |
| "step": 9255 |
| }, |
| { |
| "epoch": 8.646125116713351, |
| "grad_norm": 1.9704540967941284, |
| "learning_rate": 2.7439190074624505e-06, |
| "loss": 0.0558, |
| "num_input_tokens_seen": 1195352, |
| "step": 9260 |
| }, |
| { |
| "epoch": 8.65079365079365, |
| "grad_norm": 3.324946403503418, |
| "learning_rate": 2.7253917978905696e-06, |
| "loss": 0.0753, |
| "num_input_tokens_seen": 1196056, |
| "step": 9265 |
| }, |
| { |
| "epoch": 8.655462184873949, |
| "grad_norm": 3.511345386505127, |
| "learning_rate": 2.706923742544001e-06, |
| "loss": 0.062, |
| "num_input_tokens_seen": 1196760, |
| "step": 9270 |
| }, |
| { |
| "epoch": 8.660130718954248, |
| "grad_norm": 0.5469305515289307, |
| "learning_rate": 2.6885148904679914e-06, |
| "loss": 0.0147, |
| "num_input_tokens_seen": 1197432, |
| "step": 9275 |
| }, |
| { |
| "epoch": 8.664799253034547, |
| "grad_norm": 0.09258583188056946, |
| "learning_rate": 2.6701652905505443e-06, |
| "loss": 0.0376, |
| "num_input_tokens_seen": 1198056, |
| "step": 9280 |
| }, |
| { |
| "epoch": 8.669467787114845, |
| "grad_norm": 1.9686568975448608, |
| "learning_rate": 2.6518749915223296e-06, |
| "loss": 0.1191, |
| "num_input_tokens_seen": 1198696, |
| "step": 9285 |
| }, |
| { |
| "epoch": 8.674136321195144, |
| "grad_norm": 2.1283085346221924, |
| "learning_rate": 2.633644041956515e-06, |
| "loss": 0.0148, |
| "num_input_tokens_seen": 1199432, |
| "step": 9290 |
| }, |
| { |
| "epoch": 8.678804855275443, |
| "grad_norm": 0.5042722225189209, |
| "learning_rate": 2.6154724902686667e-06, |
| "loss": 0.0371, |
| "num_input_tokens_seen": 1200024, |
| "step": 9295 |
| }, |
| { |
| "epoch": 8.683473389355742, |
| "grad_norm": 0.5686901211738586, |
| "learning_rate": 2.597360384716613e-06, |
| "loss": 0.055, |
| "num_input_tokens_seen": 1200664, |
| "step": 9300 |
| }, |
| { |
| "epoch": 8.68814192343604, |
| "grad_norm": 0.9969847798347473, |
| "learning_rate": 2.579307773400294e-06, |
| "loss": 0.0884, |
| "num_input_tokens_seen": 1201272, |
| "step": 9305 |
| }, |
| { |
| "epoch": 8.69281045751634, |
| "grad_norm": 0.07653947919607162, |
| "learning_rate": 2.561314704261669e-06, |
| "loss": 0.0702, |
| "num_input_tokens_seen": 1201880, |
| "step": 9310 |
| }, |
| { |
| "epoch": 8.697478991596638, |
| "grad_norm": 2.5944416522979736, |
| "learning_rate": 2.543381225084568e-06, |
| "loss": 0.0872, |
| "num_input_tokens_seen": 1202520, |
| "step": 9315 |
| }, |
| { |
| "epoch": 8.702147525676937, |
| "grad_norm": 4.421728134155273, |
| "learning_rate": 2.5255073834945715e-06, |
| "loss": 0.0336, |
| "num_input_tokens_seen": 1203176, |
| "step": 9320 |
| }, |
| { |
| "epoch": 8.706816059757235, |
| "grad_norm": 1.0446518659591675, |
| "learning_rate": 2.507693226958871e-06, |
| "loss": 0.0191, |
| "num_input_tokens_seen": 1203784, |
| "step": 9325 |
| }, |
| { |
| "epoch": 8.711484593837534, |
| "grad_norm": 9.855815887451172, |
| "learning_rate": 2.4899388027861626e-06, |
| "loss": 0.1938, |
| "num_input_tokens_seen": 1204440, |
| "step": 9330 |
| }, |
| { |
| "epoch": 8.716153127917833, |
| "grad_norm": 0.242303729057312, |
| "learning_rate": 2.472244158126508e-06, |
| "loss": 0.0286, |
| "num_input_tokens_seen": 1205144, |
| "step": 9335 |
| }, |
| { |
| "epoch": 8.720821661998132, |
| "grad_norm": 0.00873336661607027, |
| "learning_rate": 2.45460933997122e-06, |
| "loss": 0.0911, |
| "num_input_tokens_seen": 1205816, |
| "step": 9340 |
| }, |
| { |
| "epoch": 8.72549019607843, |
| "grad_norm": 2.0310895442962646, |
| "learning_rate": 2.437034395152729e-06, |
| "loss": 0.0562, |
| "num_input_tokens_seen": 1206472, |
| "step": 9345 |
| }, |
| { |
| "epoch": 8.73015873015873, |
| "grad_norm": 1.3347821235656738, |
| "learning_rate": 2.4195193703444587e-06, |
| "loss": 0.1385, |
| "num_input_tokens_seen": 1207160, |
| "step": 9350 |
| }, |
| { |
| "epoch": 8.73482726423903, |
| "grad_norm": 1.2632123231887817, |
| "learning_rate": 2.4020643120607034e-06, |
| "loss": 0.0297, |
| "num_input_tokens_seen": 1207784, |
| "step": 9355 |
| }, |
| { |
| "epoch": 8.739495798319329, |
| "grad_norm": 0.7416768670082092, |
| "learning_rate": 2.3846692666565055e-06, |
| "loss": 0.0144, |
| "num_input_tokens_seen": 1208392, |
| "step": 9360 |
| }, |
| { |
| "epoch": 8.744164332399627, |
| "grad_norm": 1.5258066654205322, |
| "learning_rate": 2.3673342803275434e-06, |
| "loss": 0.0356, |
| "num_input_tokens_seen": 1209048, |
| "step": 9365 |
| }, |
| { |
| "epoch": 8.748832866479926, |
| "grad_norm": 0.5528998970985413, |
| "learning_rate": 2.3500593991099774e-06, |
| "loss": 0.0157, |
| "num_input_tokens_seen": 1209752, |
| "step": 9370 |
| }, |
| { |
| "epoch": 8.753501400560225, |
| "grad_norm": 3.4063894748687744, |
| "learning_rate": 2.3328446688803685e-06, |
| "loss": 0.0133, |
| "num_input_tokens_seen": 1210376, |
| "step": 9375 |
| }, |
| { |
| "epoch": 8.758169934640524, |
| "grad_norm": 0.1745508909225464, |
| "learning_rate": 2.3156901353555167e-06, |
| "loss": 0.0388, |
| "num_input_tokens_seen": 1211160, |
| "step": 9380 |
| }, |
| { |
| "epoch": 8.762838468720823, |
| "grad_norm": 2.1299688816070557, |
| "learning_rate": 2.298595844092377e-06, |
| "loss": 0.1209, |
| "num_input_tokens_seen": 1211784, |
| "step": 9385 |
| }, |
| { |
| "epoch": 8.767507002801121, |
| "grad_norm": 3.634575128555298, |
| "learning_rate": 2.2815618404879087e-06, |
| "loss": 0.0841, |
| "num_input_tokens_seen": 1212456, |
| "step": 9390 |
| }, |
| { |
| "epoch": 8.77217553688142, |
| "grad_norm": 6.291904926300049, |
| "learning_rate": 2.2645881697789697e-06, |
| "loss": 0.1058, |
| "num_input_tokens_seen": 1213032, |
| "step": 9395 |
| }, |
| { |
| "epoch": 8.776844070961719, |
| "grad_norm": 4.1127095222473145, |
| "learning_rate": 2.2476748770421995e-06, |
| "loss": 0.0461, |
| "num_input_tokens_seen": 1213800, |
| "step": 9400 |
| }, |
| { |
| "epoch": 8.781512605042018, |
| "grad_norm": 3.1000053882598877, |
| "learning_rate": 2.2308220071938805e-06, |
| "loss": 0.0455, |
| "num_input_tokens_seen": 1214424, |
| "step": 9405 |
| }, |
| { |
| "epoch": 8.786181139122316, |
| "grad_norm": 2.81400465965271, |
| "learning_rate": 2.214029604989834e-06, |
| "loss": 0.0553, |
| "num_input_tokens_seen": 1215016, |
| "step": 9410 |
| }, |
| { |
| "epoch": 8.790849673202615, |
| "grad_norm": 0.10587247461080551, |
| "learning_rate": 2.1972977150253064e-06, |
| "loss": 0.0247, |
| "num_input_tokens_seen": 1215672, |
| "step": 9415 |
| }, |
| { |
| "epoch": 8.795518207282914, |
| "grad_norm": 0.8938270211219788, |
| "learning_rate": 2.1806263817348432e-06, |
| "loss": 0.0609, |
| "num_input_tokens_seen": 1216328, |
| "step": 9420 |
| }, |
| { |
| "epoch": 8.800186741363213, |
| "grad_norm": 0.26707565784454346, |
| "learning_rate": 2.1640156493921566e-06, |
| "loss": 0.1118, |
| "num_input_tokens_seen": 1217032, |
| "step": 9425 |
| }, |
| { |
| "epoch": 8.804855275443511, |
| "grad_norm": 0.22385770082473755, |
| "learning_rate": 2.1474655621100347e-06, |
| "loss": 0.1078, |
| "num_input_tokens_seen": 1217784, |
| "step": 9430 |
| }, |
| { |
| "epoch": 8.80952380952381, |
| "grad_norm": 0.3320285379886627, |
| "learning_rate": 2.130976163840212e-06, |
| "loss": 0.0087, |
| "num_input_tokens_seen": 1218536, |
| "step": 9435 |
| }, |
| { |
| "epoch": 8.814192343604109, |
| "grad_norm": 4.328696250915527, |
| "learning_rate": 2.1145474983732484e-06, |
| "loss": 0.0654, |
| "num_input_tokens_seen": 1219128, |
| "step": 9440 |
| }, |
| { |
| "epoch": 8.818860877684408, |
| "grad_norm": 1.3271536827087402, |
| "learning_rate": 2.0981796093384216e-06, |
| "loss": 0.0678, |
| "num_input_tokens_seen": 1219720, |
| "step": 9445 |
| }, |
| { |
| "epoch": 8.823529411764707, |
| "grad_norm": 5.28490686416626, |
| "learning_rate": 2.0818725402035944e-06, |
| "loss": 0.0571, |
| "num_input_tokens_seen": 1220344, |
| "step": 9450 |
| }, |
| { |
| "epoch": 8.828197945845005, |
| "grad_norm": 3.1943273544311523, |
| "learning_rate": 2.06562633427512e-06, |
| "loss": 0.1072, |
| "num_input_tokens_seen": 1221096, |
| "step": 9455 |
| }, |
| { |
| "epoch": 8.832866479925304, |
| "grad_norm": 0.32759732007980347, |
| "learning_rate": 2.0494410346977216e-06, |
| "loss": 0.0025, |
| "num_input_tokens_seen": 1221720, |
| "step": 9460 |
| }, |
| { |
| "epoch": 8.837535014005603, |
| "grad_norm": 0.14161980152130127, |
| "learning_rate": 2.03331668445437e-06, |
| "loss": 0.0841, |
| "num_input_tokens_seen": 1222312, |
| "step": 9465 |
| }, |
| { |
| "epoch": 8.842203548085902, |
| "grad_norm": 0.1818762719631195, |
| "learning_rate": 2.017253326366181e-06, |
| "loss": 0.0305, |
| "num_input_tokens_seen": 1222920, |
| "step": 9470 |
| }, |
| { |
| "epoch": 8.8468720821662, |
| "grad_norm": 3.802173614501953, |
| "learning_rate": 2.0012510030922775e-06, |
| "loss": 0.0523, |
| "num_input_tokens_seen": 1223480, |
| "step": 9475 |
| }, |
| { |
| "epoch": 8.8515406162465, |
| "grad_norm": 1.9173213243484497, |
| "learning_rate": 1.985309757129711e-06, |
| "loss": 0.0463, |
| "num_input_tokens_seen": 1224056, |
| "step": 9480 |
| }, |
| { |
| "epoch": 8.856209150326798, |
| "grad_norm": 0.11994439363479614, |
| "learning_rate": 1.9694296308133298e-06, |
| "loss": 0.0162, |
| "num_input_tokens_seen": 1224680, |
| "step": 9485 |
| }, |
| { |
| "epoch": 8.860877684407097, |
| "grad_norm": 2.666987180709839, |
| "learning_rate": 1.9536106663156555e-06, |
| "loss": 0.0056, |
| "num_input_tokens_seen": 1225368, |
| "step": 9490 |
| }, |
| { |
| "epoch": 8.865546218487395, |
| "grad_norm": 5.9014410972595215, |
| "learning_rate": 1.9378529056467976e-06, |
| "loss": 0.0321, |
| "num_input_tokens_seen": 1226120, |
| "step": 9495 |
| }, |
| { |
| "epoch": 8.870214752567694, |
| "grad_norm": 1.6379077434539795, |
| "learning_rate": 1.9221563906543143e-06, |
| "loss": 0.0841, |
| "num_input_tokens_seen": 1226856, |
| "step": 9500 |
| }, |
| { |
| "epoch": 8.874883286647993, |
| "grad_norm": 0.10031027346849442, |
| "learning_rate": 1.9065211630231283e-06, |
| "loss": 0.0318, |
| "num_input_tokens_seen": 1227480, |
| "step": 9505 |
| }, |
| { |
| "epoch": 8.879551820728292, |
| "grad_norm": 5.307277202606201, |
| "learning_rate": 1.8909472642753917e-06, |
| "loss": 0.1524, |
| "num_input_tokens_seen": 1228120, |
| "step": 9510 |
| }, |
| { |
| "epoch": 8.88422035480859, |
| "grad_norm": 0.08876470476388931, |
| "learning_rate": 1.8754347357703955e-06, |
| "loss": 0.0114, |
| "num_input_tokens_seen": 1228744, |
| "step": 9515 |
| }, |
| { |
| "epoch": 8.88888888888889, |
| "grad_norm": 0.21213771402835846, |
| "learning_rate": 1.85998361870445e-06, |
| "loss": 0.0057, |
| "num_input_tokens_seen": 1229336, |
| "step": 9520 |
| }, |
| { |
| "epoch": 8.893557422969188, |
| "grad_norm": 0.10798242688179016, |
| "learning_rate": 1.8445939541107654e-06, |
| "loss": 0.0121, |
| "num_input_tokens_seen": 1229912, |
| "step": 9525 |
| }, |
| { |
| "epoch": 8.898225957049487, |
| "grad_norm": 0.17130322754383087, |
| "learning_rate": 1.8292657828593712e-06, |
| "loss": 0.0869, |
| "num_input_tokens_seen": 1230648, |
| "step": 9530 |
| }, |
| { |
| "epoch": 8.902894491129786, |
| "grad_norm": 0.165962815284729, |
| "learning_rate": 1.8139991456569694e-06, |
| "loss": 0.0161, |
| "num_input_tokens_seen": 1231400, |
| "step": 9535 |
| }, |
| { |
| "epoch": 8.907563025210084, |
| "grad_norm": 1.7331267595291138, |
| "learning_rate": 1.7987940830468675e-06, |
| "loss": 0.0493, |
| "num_input_tokens_seen": 1232104, |
| "step": 9540 |
| }, |
| { |
| "epoch": 8.912231559290383, |
| "grad_norm": 0.1018749475479126, |
| "learning_rate": 1.7836506354088428e-06, |
| "loss": 0.1381, |
| "num_input_tokens_seen": 1232776, |
| "step": 9545 |
| }, |
| { |
| "epoch": 8.916900093370682, |
| "grad_norm": 0.2601299583911896, |
| "learning_rate": 1.768568842959037e-06, |
| "loss": 0.0917, |
| "num_input_tokens_seen": 1233352, |
| "step": 9550 |
| }, |
| { |
| "epoch": 8.92156862745098, |
| "grad_norm": 7.419183254241943, |
| "learning_rate": 1.7535487457498583e-06, |
| "loss": 0.0666, |
| "num_input_tokens_seen": 1234008, |
| "step": 9555 |
| }, |
| { |
| "epoch": 8.92623716153128, |
| "grad_norm": 4.737880706787109, |
| "learning_rate": 1.7385903836698802e-06, |
| "loss": 0.0894, |
| "num_input_tokens_seen": 1234664, |
| "step": 9560 |
| }, |
| { |
| "epoch": 8.930905695611578, |
| "grad_norm": 4.031538009643555, |
| "learning_rate": 1.723693796443726e-06, |
| "loss": 0.1108, |
| "num_input_tokens_seen": 1235272, |
| "step": 9565 |
| }, |
| { |
| "epoch": 8.935574229691877, |
| "grad_norm": 4.560408115386963, |
| "learning_rate": 1.7088590236319507e-06, |
| "loss": 0.1927, |
| "num_input_tokens_seen": 1235864, |
| "step": 9570 |
| }, |
| { |
| "epoch": 8.940242763772176, |
| "grad_norm": 4.4047088623046875, |
| "learning_rate": 1.6940861046309625e-06, |
| "loss": 0.0726, |
| "num_input_tokens_seen": 1236520, |
| "step": 9575 |
| }, |
| { |
| "epoch": 8.944911297852475, |
| "grad_norm": 0.03612064942717552, |
| "learning_rate": 1.6793750786729012e-06, |
| "loss": 0.0995, |
| "num_input_tokens_seen": 1237240, |
| "step": 9580 |
| }, |
| { |
| "epoch": 8.949579831932773, |
| "grad_norm": 1.356696605682373, |
| "learning_rate": 1.664725984825541e-06, |
| "loss": 0.0148, |
| "num_input_tokens_seen": 1237848, |
| "step": 9585 |
| }, |
| { |
| "epoch": 8.954248366013072, |
| "grad_norm": 1.6855072975158691, |
| "learning_rate": 1.650138861992187e-06, |
| "loss": 0.0804, |
| "num_input_tokens_seen": 1238456, |
| "step": 9590 |
| }, |
| { |
| "epoch": 8.95891690009337, |
| "grad_norm": 7.222161293029785, |
| "learning_rate": 1.6356137489115658e-06, |
| "loss": 0.0405, |
| "num_input_tokens_seen": 1239064, |
| "step": 9595 |
| }, |
| { |
| "epoch": 8.96358543417367, |
| "grad_norm": 0.48260605335235596, |
| "learning_rate": 1.6211506841577185e-06, |
| "loss": 0.0288, |
| "num_input_tokens_seen": 1239720, |
| "step": 9600 |
| }, |
| { |
| "epoch": 8.968253968253968, |
| "grad_norm": 0.33404096961021423, |
| "learning_rate": 1.6067497061399179e-06, |
| "loss": 0.0085, |
| "num_input_tokens_seen": 1240376, |
| "step": 9605 |
| }, |
| { |
| "epoch": 8.972922502334267, |
| "grad_norm": 0.10782495886087418, |
| "learning_rate": 1.592410853102555e-06, |
| "loss": 0.0324, |
| "num_input_tokens_seen": 1241000, |
| "step": 9610 |
| }, |
| { |
| "epoch": 8.977591036414566, |
| "grad_norm": 6.687010288238525, |
| "learning_rate": 1.5781341631250224e-06, |
| "loss": 0.0385, |
| "num_input_tokens_seen": 1241704, |
| "step": 9615 |
| }, |
| { |
| "epoch": 8.982259570494865, |
| "grad_norm": 5.4267659187316895, |
| "learning_rate": 1.563919674121636e-06, |
| "loss": 0.0529, |
| "num_input_tokens_seen": 1242312, |
| "step": 9620 |
| }, |
| { |
| "epoch": 8.986928104575163, |
| "grad_norm": 0.3334493637084961, |
| "learning_rate": 1.5497674238415277e-06, |
| "loss": 0.0312, |
| "num_input_tokens_seen": 1242888, |
| "step": 9625 |
| }, |
| { |
| "epoch": 8.991596638655462, |
| "grad_norm": 12.44144344329834, |
| "learning_rate": 1.5356774498685417e-06, |
| "loss": 0.1086, |
| "num_input_tokens_seen": 1243480, |
| "step": 9630 |
| }, |
| { |
| "epoch": 8.996265172735761, |
| "grad_norm": 1.71649169921875, |
| "learning_rate": 1.521649789621138e-06, |
| "loss": 0.02, |
| "num_input_tokens_seen": 1244120, |
| "step": 9635 |
| }, |
| { |
| "epoch": 9.00093370681606, |
| "grad_norm": 0.35185933113098145, |
| "learning_rate": 1.5076844803522922e-06, |
| "loss": 0.0347, |
| "num_input_tokens_seen": 1244768, |
| "step": 9640 |
| }, |
| { |
| "epoch": 9.005602240896359, |
| "grad_norm": 2.0746331214904785, |
| "learning_rate": 1.4937815591493848e-06, |
| "loss": 0.0369, |
| "num_input_tokens_seen": 1245392, |
| "step": 9645 |
| }, |
| { |
| "epoch": 9.008403361344538, |
| "eval_loss": 1.1851145029067993, |
| "eval_runtime": 3.8629, |
| "eval_samples_per_second": 61.611, |
| "eval_steps_per_second": 30.806, |
| "num_input_tokens_seen": 1245744, |
| "step": 9648 |
| }, |
| { |
| "epoch": 9.010270774976657, |
| "grad_norm": 2.7271130084991455, |
| "learning_rate": 1.4799410629341315e-06, |
| "loss": 0.0366, |
| "num_input_tokens_seen": 1245968, |
| "step": 9650 |
| }, |
| { |
| "epoch": 9.014939309056956, |
| "grad_norm": 0.29737892746925354, |
| "learning_rate": 1.4661630284624444e-06, |
| "loss": 0.0343, |
| "num_input_tokens_seen": 1246640, |
| "step": 9655 |
| }, |
| { |
| "epoch": 9.019607843137255, |
| "grad_norm": 0.14246688783168793, |
| "learning_rate": 1.4524474923243825e-06, |
| "loss": 0.0131, |
| "num_input_tokens_seen": 1247232, |
| "step": 9660 |
| }, |
| { |
| "epoch": 9.024276377217554, |
| "grad_norm": 0.5064313411712646, |
| "learning_rate": 1.438794490944012e-06, |
| "loss": 0.0371, |
| "num_input_tokens_seen": 1247872, |
| "step": 9665 |
| }, |
| { |
| "epoch": 9.028944911297852, |
| "grad_norm": 7.1344404220581055, |
| "learning_rate": 1.4252040605793327e-06, |
| "loss": 0.1093, |
| "num_input_tokens_seen": 1248528, |
| "step": 9670 |
| }, |
| { |
| "epoch": 9.033613445378151, |
| "grad_norm": 2.0987865924835205, |
| "learning_rate": 1.411676237322171e-06, |
| "loss": 0.0118, |
| "num_input_tokens_seen": 1249216, |
| "step": 9675 |
| }, |
| { |
| "epoch": 9.03828197945845, |
| "grad_norm": 0.6497586369514465, |
| "learning_rate": 1.3982110570980978e-06, |
| "loss": 0.0099, |
| "num_input_tokens_seen": 1249824, |
| "step": 9680 |
| }, |
| { |
| "epoch": 9.042950513538749, |
| "grad_norm": 2.457868814468384, |
| "learning_rate": 1.3848085556663198e-06, |
| "loss": 0.0787, |
| "num_input_tokens_seen": 1250512, |
| "step": 9685 |
| }, |
| { |
| "epoch": 9.047619047619047, |
| "grad_norm": 0.06839052587747574, |
| "learning_rate": 1.3714687686195827e-06, |
| "loss": 0.0144, |
| "num_input_tokens_seen": 1251248, |
| "step": 9690 |
| }, |
| { |
| "epoch": 9.052287581699346, |
| "grad_norm": 0.15281909704208374, |
| "learning_rate": 1.358191731384098e-06, |
| "loss": 0.0291, |
| "num_input_tokens_seen": 1251952, |
| "step": 9695 |
| }, |
| { |
| "epoch": 9.056956115779645, |
| "grad_norm": 1.8181029558181763, |
| "learning_rate": 1.3449774792194114e-06, |
| "loss": 0.0161, |
| "num_input_tokens_seen": 1252624, |
| "step": 9700 |
| }, |
| { |
| "epoch": 9.061624649859944, |
| "grad_norm": 2.0059690475463867, |
| "learning_rate": 1.3318260472183514e-06, |
| "loss": 0.0067, |
| "num_input_tokens_seen": 1253344, |
| "step": 9705 |
| }, |
| { |
| "epoch": 9.066293183940243, |
| "grad_norm": 0.31657326221466064, |
| "learning_rate": 1.3187374703069105e-06, |
| "loss": 0.0784, |
| "num_input_tokens_seen": 1253920, |
| "step": 9710 |
| }, |
| { |
| "epoch": 9.070961718020541, |
| "grad_norm": 1.749794840812683, |
| "learning_rate": 1.3057117832441567e-06, |
| "loss": 0.0428, |
| "num_input_tokens_seen": 1254560, |
| "step": 9715 |
| }, |
| { |
| "epoch": 9.07563025210084, |
| "grad_norm": 3.9665474891662598, |
| "learning_rate": 1.2927490206221388e-06, |
| "loss": 0.0354, |
| "num_input_tokens_seen": 1255168, |
| "step": 9720 |
| }, |
| { |
| "epoch": 9.080298786181139, |
| "grad_norm": 0.1431313157081604, |
| "learning_rate": 1.2798492168658083e-06, |
| "loss": 0.0189, |
| "num_input_tokens_seen": 1255888, |
| "step": 9725 |
| }, |
| { |
| "epoch": 9.084967320261438, |
| "grad_norm": 5.252475738525391, |
| "learning_rate": 1.267012406232909e-06, |
| "loss": 0.097, |
| "num_input_tokens_seen": 1256464, |
| "step": 9730 |
| }, |
| { |
| "epoch": 9.089635854341736, |
| "grad_norm": 0.1456415206193924, |
| "learning_rate": 1.2542386228138997e-06, |
| "loss": 0.0019, |
| "num_input_tokens_seen": 1257120, |
| "step": 9735 |
| }, |
| { |
| "epoch": 9.094304388422035, |
| "grad_norm": 0.6008406281471252, |
| "learning_rate": 1.241527900531858e-06, |
| "loss": 0.0025, |
| "num_input_tokens_seen": 1257776, |
| "step": 9740 |
| }, |
| { |
| "epoch": 9.098972922502334, |
| "grad_norm": 0.1887224167585373, |
| "learning_rate": 1.2288802731423883e-06, |
| "loss": 0.0828, |
| "num_input_tokens_seen": 1258464, |
| "step": 9745 |
| }, |
| { |
| "epoch": 9.103641456582633, |
| "grad_norm": 12.766477584838867, |
| "learning_rate": 1.2162957742335418e-06, |
| "loss": 0.1032, |
| "num_input_tokens_seen": 1259088, |
| "step": 9750 |
| }, |
| { |
| "epoch": 9.108309990662931, |
| "grad_norm": 6.654147148132324, |
| "learning_rate": 1.203774437225716e-06, |
| "loss": 0.0534, |
| "num_input_tokens_seen": 1259696, |
| "step": 9755 |
| }, |
| { |
| "epoch": 9.11297852474323, |
| "grad_norm": 0.9072001576423645, |
| "learning_rate": 1.1913162953715695e-06, |
| "loss": 0.0488, |
| "num_input_tokens_seen": 1260336, |
| "step": 9760 |
| }, |
| { |
| "epoch": 9.117647058823529, |
| "grad_norm": 0.38549622893333435, |
| "learning_rate": 1.1789213817559458e-06, |
| "loss": 0.2404, |
| "num_input_tokens_seen": 1260992, |
| "step": 9765 |
| }, |
| { |
| "epoch": 9.122315592903828, |
| "grad_norm": 1.1125404834747314, |
| "learning_rate": 1.1665897292957556e-06, |
| "loss": 0.0516, |
| "num_input_tokens_seen": 1261616, |
| "step": 9770 |
| }, |
| { |
| "epoch": 9.126984126984127, |
| "grad_norm": 0.5928016304969788, |
| "learning_rate": 1.154321370739922e-06, |
| "loss": 0.0345, |
| "num_input_tokens_seen": 1262288, |
| "step": 9775 |
| }, |
| { |
| "epoch": 9.131652661064425, |
| "grad_norm": 0.19205573201179504, |
| "learning_rate": 1.1421163386692719e-06, |
| "loss": 0.0079, |
| "num_input_tokens_seen": 1262944, |
| "step": 9780 |
| }, |
| { |
| "epoch": 9.136321195144724, |
| "grad_norm": 1.2952364683151245, |
| "learning_rate": 1.1299746654964721e-06, |
| "loss": 0.0172, |
| "num_input_tokens_seen": 1263632, |
| "step": 9785 |
| }, |
| { |
| "epoch": 9.140989729225023, |
| "grad_norm": 0.17227812111377716, |
| "learning_rate": 1.117896383465905e-06, |
| "loss": 0.0222, |
| "num_input_tokens_seen": 1264320, |
| "step": 9790 |
| }, |
| { |
| "epoch": 9.145658263305322, |
| "grad_norm": 0.37296009063720703, |
| "learning_rate": 1.1058815246536263e-06, |
| "loss": 0.0033, |
| "num_input_tokens_seen": 1264928, |
| "step": 9795 |
| }, |
| { |
| "epoch": 9.15032679738562, |
| "grad_norm": 4.000223159790039, |
| "learning_rate": 1.0939301209672543e-06, |
| "loss": 0.0457, |
| "num_input_tokens_seen": 1265456, |
| "step": 9800 |
| }, |
| { |
| "epoch": 9.15499533146592, |
| "grad_norm": 0.1993858367204666, |
| "learning_rate": 1.0820422041458834e-06, |
| "loss": 0.0229, |
| "num_input_tokens_seen": 1266064, |
| "step": 9805 |
| }, |
| { |
| "epoch": 9.159663865546218, |
| "grad_norm": 1.7776328325271606, |
| "learning_rate": 1.070217805760021e-06, |
| "loss": 0.1402, |
| "num_input_tokens_seen": 1266640, |
| "step": 9810 |
| }, |
| { |
| "epoch": 9.164332399626517, |
| "grad_norm": 0.3360730707645416, |
| "learning_rate": 1.0584569572114789e-06, |
| "loss": 0.0118, |
| "num_input_tokens_seen": 1267232, |
| "step": 9815 |
| }, |
| { |
| "epoch": 9.169000933706815, |
| "grad_norm": 4.538324356079102, |
| "learning_rate": 1.046759689733301e-06, |
| "loss": 0.0666, |
| "num_input_tokens_seen": 1267824, |
| "step": 9820 |
| }, |
| { |
| "epoch": 9.173669467787114, |
| "grad_norm": 4.662381649017334, |
| "learning_rate": 1.0351260343896828e-06, |
| "loss": 0.077, |
| "num_input_tokens_seen": 1268464, |
| "step": 9825 |
| }, |
| { |
| "epoch": 9.178338001867413, |
| "grad_norm": 5.578917026519775, |
| "learning_rate": 1.0235560220758916e-06, |
| "loss": 0.0666, |
| "num_input_tokens_seen": 1269120, |
| "step": 9830 |
| }, |
| { |
| "epoch": 9.183006535947712, |
| "grad_norm": 0.07224404066801071, |
| "learning_rate": 1.0120496835181764e-06, |
| "loss": 0.0034, |
| "num_input_tokens_seen": 1269792, |
| "step": 9835 |
| }, |
| { |
| "epoch": 9.18767507002801, |
| "grad_norm": 7.480001449584961, |
| "learning_rate": 1.0006070492736775e-06, |
| "loss": 0.1353, |
| "num_input_tokens_seen": 1270352, |
| "step": 9840 |
| }, |
| { |
| "epoch": 9.19234360410831, |
| "grad_norm": 0.06923279166221619, |
| "learning_rate": 9.892281497303757e-07, |
| "loss": 0.0125, |
| "num_input_tokens_seen": 1270896, |
| "step": 9845 |
| }, |
| { |
| "epoch": 9.197012138188608, |
| "grad_norm": 2.548983573913574, |
| "learning_rate": 9.77913015106982e-07, |
| "loss": 0.0086, |
| "num_input_tokens_seen": 1271520, |
| "step": 9850 |
| }, |
| { |
| "epoch": 9.201680672268907, |
| "grad_norm": 0.0795988142490387, |
| "learning_rate": 9.66661675452865e-07, |
| "loss": 0.0109, |
| "num_input_tokens_seen": 1272128, |
| "step": 9855 |
| }, |
| { |
| "epoch": 9.206349206349206, |
| "grad_norm": 0.8964292407035828, |
| "learning_rate": 9.554741606479845e-07, |
| "loss": 0.0683, |
| "num_input_tokens_seen": 1272896, |
| "step": 9860 |
| }, |
| { |
| "epoch": 9.211017740429504, |
| "grad_norm": 0.06357522308826447, |
| "learning_rate": 9.443505004027936e-07, |
| "loss": 0.071, |
| "num_input_tokens_seen": 1273584, |
| "step": 9865 |
| }, |
| { |
| "epoch": 9.215686274509803, |
| "grad_norm": 0.20794254541397095, |
| "learning_rate": 9.332907242581735e-07, |
| "loss": 0.0026, |
| "num_input_tokens_seen": 1274288, |
| "step": 9870 |
| }, |
| { |
| "epoch": 9.220354808590102, |
| "grad_norm": 7.484773635864258, |
| "learning_rate": 9.222948615853433e-07, |
| "loss": 0.0368, |
| "num_input_tokens_seen": 1274848, |
| "step": 9875 |
| }, |
| { |
| "epoch": 9.2250233426704, |
| "grad_norm": 4.599846839904785, |
| "learning_rate": 9.113629415857999e-07, |
| "loss": 0.1142, |
| "num_input_tokens_seen": 1275536, |
| "step": 9880 |
| }, |
| { |
| "epoch": 9.2296918767507, |
| "grad_norm": 0.17251808941364288, |
| "learning_rate": 9.004949932912177e-07, |
| "loss": 0.0396, |
| "num_input_tokens_seen": 1276112, |
| "step": 9885 |
| }, |
| { |
| "epoch": 9.234360410830998, |
| "grad_norm": 0.1543770283460617, |
| "learning_rate": 8.896910455633844e-07, |
| "loss": 0.0196, |
| "num_input_tokens_seen": 1276704, |
| "step": 9890 |
| }, |
| { |
| "epoch": 9.239028944911297, |
| "grad_norm": 2.0087528228759766, |
| "learning_rate": 8.78951127094127e-07, |
| "loss": 0.1253, |
| "num_input_tokens_seen": 1277312, |
| "step": 9895 |
| }, |
| { |
| "epoch": 9.243697478991596, |
| "grad_norm": 0.400070458650589, |
| "learning_rate": 8.682752664052302e-07, |
| "loss": 0.0722, |
| "num_input_tokens_seen": 1277920, |
| "step": 9900 |
| }, |
| { |
| "epoch": 9.248366013071895, |
| "grad_norm": 0.5074196457862854, |
| "learning_rate": 8.576634918483567e-07, |
| "loss": 0.0762, |
| "num_input_tokens_seen": 1278624, |
| "step": 9905 |
| }, |
| { |
| "epoch": 9.253034547152193, |
| "grad_norm": 0.04042162373661995, |
| "learning_rate": 8.47115831604986e-07, |
| "loss": 0.0647, |
| "num_input_tokens_seen": 1279248, |
| "step": 9910 |
| }, |
| { |
| "epoch": 9.257703081232492, |
| "grad_norm": 2.7458279132843018, |
| "learning_rate": 8.366323136863225e-07, |
| "loss": 0.0108, |
| "num_input_tokens_seen": 1279904, |
| "step": 9915 |
| }, |
| { |
| "epoch": 9.262371615312793, |
| "grad_norm": 2.4689865112304688, |
| "learning_rate": 8.262129659332346e-07, |
| "loss": 0.0362, |
| "num_input_tokens_seen": 1280496, |
| "step": 9920 |
| }, |
| { |
| "epoch": 9.267040149393091, |
| "grad_norm": 2.182461738586426, |
| "learning_rate": 8.15857816016169e-07, |
| "loss": 0.0638, |
| "num_input_tokens_seen": 1281200, |
| "step": 9925 |
| }, |
| { |
| "epoch": 9.27170868347339, |
| "grad_norm": 1.7279038429260254, |
| "learning_rate": 8.055668914350916e-07, |
| "loss": 0.0492, |
| "num_input_tokens_seen": 1281776, |
| "step": 9930 |
| }, |
| { |
| "epoch": 9.276377217553689, |
| "grad_norm": 0.5922943949699402, |
| "learning_rate": 7.953402195193999e-07, |
| "loss": 0.0432, |
| "num_input_tokens_seen": 1282416, |
| "step": 9935 |
| }, |
| { |
| "epoch": 9.281045751633988, |
| "grad_norm": 0.8308282494544983, |
| "learning_rate": 7.851778274278576e-07, |
| "loss": 0.0086, |
| "num_input_tokens_seen": 1283088, |
| "step": 9940 |
| }, |
| { |
| "epoch": 9.285714285714286, |
| "grad_norm": 0.3864976167678833, |
| "learning_rate": 7.750797421485267e-07, |
| "loss": 0.0211, |
| "num_input_tokens_seen": 1283664, |
| "step": 9945 |
| }, |
| { |
| "epoch": 9.290382819794585, |
| "grad_norm": 6.857438087463379, |
| "learning_rate": 7.650459904986834e-07, |
| "loss": 0.0093, |
| "num_input_tokens_seen": 1284320, |
| "step": 9950 |
| }, |
| { |
| "epoch": 9.295051353874884, |
| "grad_norm": 10.91110610961914, |
| "learning_rate": 7.550765991247654e-07, |
| "loss": 0.1058, |
| "num_input_tokens_seen": 1284880, |
| "step": 9955 |
| }, |
| { |
| "epoch": 9.299719887955183, |
| "grad_norm": 3.3910160064697266, |
| "learning_rate": 7.451715945022752e-07, |
| "loss": 0.0478, |
| "num_input_tokens_seen": 1285456, |
| "step": 9960 |
| }, |
| { |
| "epoch": 9.304388422035482, |
| "grad_norm": 0.13573530316352844, |
| "learning_rate": 7.353310029357352e-07, |
| "loss": 0.0075, |
| "num_input_tokens_seen": 1286144, |
| "step": 9965 |
| }, |
| { |
| "epoch": 9.30905695611578, |
| "grad_norm": 0.33091700077056885, |
| "learning_rate": 7.255548505586074e-07, |
| "loss": 0.066, |
| "num_input_tokens_seen": 1286752, |
| "step": 9970 |
| }, |
| { |
| "epoch": 9.313725490196079, |
| "grad_norm": 0.1710100620985031, |
| "learning_rate": 7.158431633332241e-07, |
| "loss": 0.0434, |
| "num_input_tokens_seen": 1287440, |
| "step": 9975 |
| }, |
| { |
| "epoch": 9.318394024276378, |
| "grad_norm": 1.2184041738510132, |
| "learning_rate": 7.061959670507102e-07, |
| "loss": 0.0267, |
| "num_input_tokens_seen": 1288048, |
| "step": 9980 |
| }, |
| { |
| "epoch": 9.323062558356677, |
| "grad_norm": 0.09887643158435822, |
| "learning_rate": 6.966132873309273e-07, |
| "loss": 0.0602, |
| "num_input_tokens_seen": 1288688, |
| "step": 9985 |
| }, |
| { |
| "epoch": 9.327731092436975, |
| "grad_norm": 0.12310995906591415, |
| "learning_rate": 6.870951496224076e-07, |
| "loss": 0.0142, |
| "num_input_tokens_seen": 1289328, |
| "step": 9990 |
| }, |
| { |
| "epoch": 9.332399626517274, |
| "grad_norm": 0.08117198199033737, |
| "learning_rate": 6.776415792022789e-07, |
| "loss": 0.0431, |
| "num_input_tokens_seen": 1289936, |
| "step": 9995 |
| }, |
| { |
| "epoch": 9.337068160597573, |
| "grad_norm": 0.21514607965946198, |
| "learning_rate": 6.682526011761919e-07, |
| "loss": 0.003, |
| "num_input_tokens_seen": 1290656, |
| "step": 10000 |
| }, |
| { |
| "epoch": 9.341736694677872, |
| "grad_norm": 0.2976192831993103, |
| "learning_rate": 6.589282404782682e-07, |
| "loss": 0.0577, |
| "num_input_tokens_seen": 1291184, |
| "step": 10005 |
| }, |
| { |
| "epoch": 9.34640522875817, |
| "grad_norm": 3.206972122192383, |
| "learning_rate": 6.496685218710219e-07, |
| "loss": 0.0448, |
| "num_input_tokens_seen": 1291808, |
| "step": 10010 |
| }, |
| { |
| "epoch": 9.35107376283847, |
| "grad_norm": 0.09685748815536499, |
| "learning_rate": 6.404734699453018e-07, |
| "loss": 0.0254, |
| "num_input_tokens_seen": 1292464, |
| "step": 10015 |
| }, |
| { |
| "epoch": 9.355742296918768, |
| "grad_norm": 0.07187190651893616, |
| "learning_rate": 6.313431091202165e-07, |
| "loss": 0.0107, |
| "num_input_tokens_seen": 1293072, |
| "step": 10020 |
| }, |
| { |
| "epoch": 9.360410830999067, |
| "grad_norm": 0.08758264780044556, |
| "learning_rate": 6.222774636430811e-07, |
| "loss": 0.0651, |
| "num_input_tokens_seen": 1293648, |
| "step": 10025 |
| }, |
| { |
| "epoch": 9.365079365079366, |
| "grad_norm": 0.6055055856704712, |
| "learning_rate": 6.13276557589354e-07, |
| "loss": 0.061, |
| "num_input_tokens_seen": 1294224, |
| "step": 10030 |
| }, |
| { |
| "epoch": 9.369747899159664, |
| "grad_norm": 0.9085772037506104, |
| "learning_rate": 6.043404148625503e-07, |
| "loss": 0.0291, |
| "num_input_tokens_seen": 1294912, |
| "step": 10035 |
| }, |
| { |
| "epoch": 9.374416433239963, |
| "grad_norm": 3.352837324142456, |
| "learning_rate": 5.954690591942036e-07, |
| "loss": 0.0216, |
| "num_input_tokens_seen": 1295616, |
| "step": 10040 |
| }, |
| { |
| "epoch": 9.379084967320262, |
| "grad_norm": 0.11360262334346771, |
| "learning_rate": 5.866625141437959e-07, |
| "loss": 0.0207, |
| "num_input_tokens_seen": 1296256, |
| "step": 10045 |
| }, |
| { |
| "epoch": 9.38375350140056, |
| "grad_norm": 0.08144768327474594, |
| "learning_rate": 5.779208030986916e-07, |
| "loss": 0.1058, |
| "num_input_tokens_seen": 1296896, |
| "step": 10050 |
| }, |
| { |
| "epoch": 9.38842203548086, |
| "grad_norm": 0.12814949452877045, |
| "learning_rate": 5.692439492740759e-07, |
| "loss": 0.0058, |
| "num_input_tokens_seen": 1297472, |
| "step": 10055 |
| }, |
| { |
| "epoch": 9.393090569561158, |
| "grad_norm": 0.12964555621147156, |
| "learning_rate": 5.606319757128914e-07, |
| "loss": 0.0269, |
| "num_input_tokens_seen": 1297984, |
| "step": 10060 |
| }, |
| { |
| "epoch": 9.397759103641457, |
| "grad_norm": 0.6443309783935547, |
| "learning_rate": 5.520849052857768e-07, |
| "loss": 0.0085, |
| "num_input_tokens_seen": 1298592, |
| "step": 10065 |
| }, |
| { |
| "epoch": 9.402427637721756, |
| "grad_norm": 0.11005096882581711, |
| "learning_rate": 5.436027606910199e-07, |
| "loss": 0.002, |
| "num_input_tokens_seen": 1299216, |
| "step": 10070 |
| }, |
| { |
| "epoch": 9.407096171802054, |
| "grad_norm": 0.4922139644622803, |
| "learning_rate": 5.351855644544796e-07, |
| "loss": 0.0396, |
| "num_input_tokens_seen": 1299808, |
| "step": 10075 |
| }, |
| { |
| "epoch": 9.411764705882353, |
| "grad_norm": 6.919455051422119, |
| "learning_rate": 5.26833338929536e-07, |
| "loss": 0.1592, |
| "num_input_tokens_seen": 1300416, |
| "step": 10080 |
| }, |
| { |
| "epoch": 9.416433239962652, |
| "grad_norm": 0.36009007692337036, |
| "learning_rate": 5.18546106297016e-07, |
| "loss": 0.0093, |
| "num_input_tokens_seen": 1301200, |
| "step": 10085 |
| }, |
| { |
| "epoch": 9.42110177404295, |
| "grad_norm": 1.9589595794677734, |
| "learning_rate": 5.103238885651618e-07, |
| "loss": 0.1431, |
| "num_input_tokens_seen": 1301936, |
| "step": 10090 |
| }, |
| { |
| "epoch": 9.42577030812325, |
| "grad_norm": 0.031739529222249985, |
| "learning_rate": 5.021667075695541e-07, |
| "loss": 0.0309, |
| "num_input_tokens_seen": 1302576, |
| "step": 10095 |
| }, |
| { |
| "epoch": 9.430438842203548, |
| "grad_norm": 0.13981761038303375, |
| "learning_rate": 4.940745849730421e-07, |
| "loss": 0.0134, |
| "num_input_tokens_seen": 1303104, |
| "step": 10100 |
| }, |
| { |
| "epoch": 9.435107376283847, |
| "grad_norm": 1.46657395362854, |
| "learning_rate": 4.860475422657218e-07, |
| "loss": 0.0052, |
| "num_input_tokens_seen": 1303712, |
| "step": 10105 |
| }, |
| { |
| "epoch": 9.439775910364146, |
| "grad_norm": 4.406033515930176, |
| "learning_rate": 4.780856007648437e-07, |
| "loss": 0.0946, |
| "num_input_tokens_seen": 1304320, |
| "step": 10110 |
| }, |
| { |
| "epoch": 9.444444444444445, |
| "grad_norm": 0.8488879203796387, |
| "learning_rate": 4.701887816147721e-07, |
| "loss": 0.0124, |
| "num_input_tokens_seen": 1304960, |
| "step": 10115 |
| }, |
| { |
| "epoch": 9.449112978524743, |
| "grad_norm": 0.2057238519191742, |
| "learning_rate": 4.6235710578693135e-07, |
| "loss": 0.0745, |
| "num_input_tokens_seen": 1305632, |
| "step": 10120 |
| }, |
| { |
| "epoch": 9.453781512605042, |
| "grad_norm": 3.9333977699279785, |
| "learning_rate": 4.545905940797457e-07, |
| "loss": 0.0357, |
| "num_input_tokens_seen": 1306304, |
| "step": 10125 |
| }, |
| { |
| "epoch": 9.458450046685341, |
| "grad_norm": 0.4153412878513336, |
| "learning_rate": 4.468892671185831e-07, |
| "loss": 0.0199, |
| "num_input_tokens_seen": 1306960, |
| "step": 10130 |
| }, |
| { |
| "epoch": 9.46311858076564, |
| "grad_norm": 0.0644126683473587, |
| "learning_rate": 4.392531453556975e-07, |
| "loss": 0.0108, |
| "num_input_tokens_seen": 1307728, |
| "step": 10135 |
| }, |
| { |
| "epoch": 9.467787114845938, |
| "grad_norm": 1.2335668802261353, |
| "learning_rate": 4.316822490701866e-07, |
| "loss": 0.0149, |
| "num_input_tokens_seen": 1308288, |
| "step": 10140 |
| }, |
| { |
| "epoch": 9.472455648926237, |
| "grad_norm": 0.025622710585594177, |
| "learning_rate": 4.24176598367923e-07, |
| "loss": 0.051, |
| "num_input_tokens_seen": 1308912, |
| "step": 10145 |
| }, |
| { |
| "epoch": 9.477124183006536, |
| "grad_norm": 3.5109171867370605, |
| "learning_rate": 4.16736213181515e-07, |
| "loss": 0.0612, |
| "num_input_tokens_seen": 1309648, |
| "step": 10150 |
| }, |
| { |
| "epoch": 9.481792717086835, |
| "grad_norm": 0.17764407396316528, |
| "learning_rate": 4.0936111327024017e-07, |
| "loss": 0.034, |
| "num_input_tokens_seen": 1310240, |
| "step": 10155 |
| }, |
| { |
| "epoch": 9.486461251167134, |
| "grad_norm": 0.03151076287031174, |
| "learning_rate": 4.0205131822000087e-07, |
| "loss": 0.0195, |
| "num_input_tokens_seen": 1310944, |
| "step": 10160 |
| }, |
| { |
| "epoch": 9.491129785247432, |
| "grad_norm": 0.14204958081245422, |
| "learning_rate": 3.948068474432715e-07, |
| "loss": 0.0507, |
| "num_input_tokens_seen": 1311600, |
| "step": 10165 |
| }, |
| { |
| "epoch": 9.495798319327731, |
| "grad_norm": 2.0150723457336426, |
| "learning_rate": 3.876277201790485e-07, |
| "loss": 0.0336, |
| "num_input_tokens_seen": 1312160, |
| "step": 10170 |
| }, |
| { |
| "epoch": 9.50046685340803, |
| "grad_norm": 0.4125816226005554, |
| "learning_rate": 3.80513955492795e-07, |
| "loss": 0.0039, |
| "num_input_tokens_seen": 1312976, |
| "step": 10175 |
| }, |
| { |
| "epoch": 9.505135387488329, |
| "grad_norm": 3.8984930515289307, |
| "learning_rate": 3.7346557227639077e-07, |
| "loss": 0.0468, |
| "num_input_tokens_seen": 1313584, |
| "step": 10180 |
| }, |
| { |
| "epoch": 9.508870214752568, |
| "eval_loss": 1.2354722023010254, |
| "eval_runtime": 3.869, |
| "eval_samples_per_second": 61.515, |
| "eval_steps_per_second": 30.757, |
| "num_input_tokens_seen": 1314112, |
| "step": 10184 |
| }, |
| { |
| "epoch": 9.509803921568627, |
| "grad_norm": 0.03488253802061081, |
| "learning_rate": 3.6648258924807944e-07, |
| "loss": 0.0222, |
| "num_input_tokens_seen": 1314272, |
| "step": 10185 |
| }, |
| { |
| "epoch": 9.514472455648926, |
| "grad_norm": 0.47239649295806885, |
| "learning_rate": 3.5956502495243517e-07, |
| "loss": 0.0108, |
| "num_input_tokens_seen": 1315024, |
| "step": 10190 |
| }, |
| { |
| "epoch": 9.519140989729225, |
| "grad_norm": 0.2930271327495575, |
| "learning_rate": 3.5271289776028503e-07, |
| "loss": 0.3299, |
| "num_input_tokens_seen": 1315552, |
| "step": 10195 |
| }, |
| { |
| "epoch": 9.523809523809524, |
| "grad_norm": 10.508183479309082, |
| "learning_rate": 3.4592622586869517e-07, |
| "loss": 0.0492, |
| "num_input_tokens_seen": 1316192, |
| "step": 10200 |
| }, |
| { |
| "epoch": 9.528478057889822, |
| "grad_norm": 6.849645137786865, |
| "learning_rate": 3.3920502730088176e-07, |
| "loss": 0.0533, |
| "num_input_tokens_seen": 1316784, |
| "step": 10205 |
| }, |
| { |
| "epoch": 9.533146591970121, |
| "grad_norm": 1.7187350988388062, |
| "learning_rate": 3.3254931990620017e-07, |
| "loss": 0.038, |
| "num_input_tokens_seen": 1317344, |
| "step": 10210 |
| }, |
| { |
| "epoch": 9.53781512605042, |
| "grad_norm": 0.098175048828125, |
| "learning_rate": 3.2595912136007543e-07, |
| "loss": 0.052, |
| "num_input_tokens_seen": 1318048, |
| "step": 10215 |
| }, |
| { |
| "epoch": 9.542483660130719, |
| "grad_norm": 5.587791442871094, |
| "learning_rate": 3.1943444916396894e-07, |
| "loss": 0.0975, |
| "num_input_tokens_seen": 1318816, |
| "step": 10220 |
| }, |
| { |
| "epoch": 9.547152194211018, |
| "grad_norm": 0.12869498133659363, |
| "learning_rate": 3.129753206453201e-07, |
| "loss": 0.1018, |
| "num_input_tokens_seen": 1319456, |
| "step": 10225 |
| }, |
| { |
| "epoch": 9.551820728291316, |
| "grad_norm": 0.026377171277999878, |
| "learning_rate": 3.0658175295749656e-07, |
| "loss": 0.009, |
| "num_input_tokens_seen": 1320096, |
| "step": 10230 |
| }, |
| { |
| "epoch": 9.556489262371615, |
| "grad_norm": 1.9462381601333618, |
| "learning_rate": 3.002537630797747e-07, |
| "loss": 0.0132, |
| "num_input_tokens_seen": 1320704, |
| "step": 10235 |
| }, |
| { |
| "epoch": 9.561157796451914, |
| "grad_norm": 0.13072355091571808, |
| "learning_rate": 2.9399136781726735e-07, |
| "loss": 0.0283, |
| "num_input_tokens_seen": 1321408, |
| "step": 10240 |
| }, |
| { |
| "epoch": 9.565826330532213, |
| "grad_norm": 0.4914097487926483, |
| "learning_rate": 2.877945838008905e-07, |
| "loss": 0.0606, |
| "num_input_tokens_seen": 1322048, |
| "step": 10245 |
| }, |
| { |
| "epoch": 9.570494864612511, |
| "grad_norm": 0.8193666338920593, |
| "learning_rate": 2.816634274873192e-07, |
| "loss": 0.0388, |
| "num_input_tokens_seen": 1322720, |
| "step": 10250 |
| }, |
| { |
| "epoch": 9.57516339869281, |
| "grad_norm": 3.4000868797302246, |
| "learning_rate": 2.7559791515893717e-07, |
| "loss": 0.0181, |
| "num_input_tokens_seen": 1323376, |
| "step": 10255 |
| }, |
| { |
| "epoch": 9.579831932773109, |
| "grad_norm": 0.41794702410697937, |
| "learning_rate": 2.695980629238065e-07, |
| "loss": 0.0244, |
| "num_input_tokens_seen": 1324000, |
| "step": 10260 |
| }, |
| { |
| "epoch": 9.584500466853408, |
| "grad_norm": 1.3658123016357422, |
| "learning_rate": 2.6366388671560936e-07, |
| "loss": 0.0139, |
| "num_input_tokens_seen": 1324624, |
| "step": 10265 |
| }, |
| { |
| "epoch": 9.589169000933706, |
| "grad_norm": 1.1461269855499268, |
| "learning_rate": 2.5779540229361745e-07, |
| "loss": 0.0083, |
| "num_input_tokens_seen": 1325264, |
| "step": 10270 |
| }, |
| { |
| "epoch": 9.593837535014005, |
| "grad_norm": 2.883629083633423, |
| "learning_rate": 2.5199262524265023e-07, |
| "loss": 0.01, |
| "num_input_tokens_seen": 1325840, |
| "step": 10275 |
| }, |
| { |
| "epoch": 9.598506069094304, |
| "grad_norm": 0.3690735399723053, |
| "learning_rate": 2.462555709730197e-07, |
| "loss": 0.0113, |
| "num_input_tokens_seen": 1326432, |
| "step": 10280 |
| }, |
| { |
| "epoch": 9.603174603174603, |
| "grad_norm": 1.3626866340637207, |
| "learning_rate": 2.4058425472050785e-07, |
| "loss": 0.0555, |
| "num_input_tokens_seen": 1327072, |
| "step": 10285 |
| }, |
| { |
| "epoch": 9.607843137254902, |
| "grad_norm": 8.382253646850586, |
| "learning_rate": 2.3497869154631147e-07, |
| "loss": 0.0422, |
| "num_input_tokens_seen": 1327744, |
| "step": 10290 |
| }, |
| { |
| "epoch": 9.6125116713352, |
| "grad_norm": 2.498363494873047, |
| "learning_rate": 2.2943889633701698e-07, |
| "loss": 0.0125, |
| "num_input_tokens_seen": 1328368, |
| "step": 10295 |
| }, |
| { |
| "epoch": 9.6171802054155, |
| "grad_norm": 2.3475382328033447, |
| "learning_rate": 2.239648838045394e-07, |
| "loss": 0.0273, |
| "num_input_tokens_seen": 1329088, |
| "step": 10300 |
| }, |
| { |
| "epoch": 9.621848739495798, |
| "grad_norm": 0.2977651357650757, |
| "learning_rate": 2.1855666848610845e-07, |
| "loss": 0.0061, |
| "num_input_tokens_seen": 1329760, |
| "step": 10305 |
| }, |
| { |
| "epoch": 9.626517273576097, |
| "grad_norm": 0.7848034501075745, |
| "learning_rate": 2.132142647442048e-07, |
| "loss": 0.043, |
| "num_input_tokens_seen": 1330416, |
| "step": 10310 |
| }, |
| { |
| "epoch": 9.631185807656395, |
| "grad_norm": 0.05007505044341087, |
| "learning_rate": 2.079376867665489e-07, |
| "loss": 0.1435, |
| "num_input_tokens_seen": 1331072, |
| "step": 10315 |
| }, |
| { |
| "epoch": 9.635854341736694, |
| "grad_norm": 2.539809226989746, |
| "learning_rate": 2.0272694856603991e-07, |
| "loss": 0.0857, |
| "num_input_tokens_seen": 1331792, |
| "step": 10320 |
| }, |
| { |
| "epoch": 9.640522875816993, |
| "grad_norm": 4.598158359527588, |
| "learning_rate": 1.975820639807252e-07, |
| "loss": 0.0913, |
| "num_input_tokens_seen": 1332448, |
| "step": 10325 |
| }, |
| { |
| "epoch": 9.645191409897292, |
| "grad_norm": 0.18733039498329163, |
| "learning_rate": 1.925030466737754e-07, |
| "loss": 0.174, |
| "num_input_tokens_seen": 1333072, |
| "step": 10330 |
| }, |
| { |
| "epoch": 9.64985994397759, |
| "grad_norm": 0.0705503523349762, |
| "learning_rate": 1.8748991013343152e-07, |
| "loss": 0.063, |
| "num_input_tokens_seen": 1333680, |
| "step": 10335 |
| }, |
| { |
| "epoch": 9.65452847805789, |
| "grad_norm": 0.14228954911231995, |
| "learning_rate": 1.8254266767298023e-07, |
| "loss": 0.0043, |
| "num_input_tokens_seen": 1334368, |
| "step": 10340 |
| }, |
| { |
| "epoch": 9.659197012138188, |
| "grad_norm": 0.3317956328392029, |
| "learning_rate": 1.7766133243071192e-07, |
| "loss": 0.0349, |
| "num_input_tokens_seen": 1335072, |
| "step": 10345 |
| }, |
| { |
| "epoch": 9.663865546218487, |
| "grad_norm": 5.647100925445557, |
| "learning_rate": 1.7284591736989042e-07, |
| "loss": 0.0193, |
| "num_input_tokens_seen": 1335648, |
| "step": 10350 |
| }, |
| { |
| "epoch": 9.668534080298786, |
| "grad_norm": 3.1891496181488037, |
| "learning_rate": 1.6809643527871398e-07, |
| "loss": 0.0087, |
| "num_input_tokens_seen": 1336368, |
| "step": 10355 |
| }, |
| { |
| "epoch": 9.673202614379084, |
| "grad_norm": 0.18910200893878937, |
| "learning_rate": 1.6341289877028486e-07, |
| "loss": 0.0471, |
| "num_input_tokens_seen": 1336944, |
| "step": 10360 |
| }, |
| { |
| "epoch": 9.677871148459383, |
| "grad_norm": 0.21139132976531982, |
| "learning_rate": 1.5879532028258148e-07, |
| "loss": 0.006, |
| "num_input_tokens_seen": 1337584, |
| "step": 10365 |
| }, |
| { |
| "epoch": 9.682539682539682, |
| "grad_norm": 0.3320813775062561, |
| "learning_rate": 1.5424371207841127e-07, |
| "loss": 0.0732, |
| "num_input_tokens_seen": 1338272, |
| "step": 10370 |
| }, |
| { |
| "epoch": 9.68720821661998, |
| "grad_norm": 0.11079546809196472, |
| "learning_rate": 1.497580862453829e-07, |
| "loss": 0.0501, |
| "num_input_tokens_seen": 1338944, |
| "step": 10375 |
| }, |
| { |
| "epoch": 9.69187675070028, |
| "grad_norm": 0.1472056806087494, |
| "learning_rate": 1.453384546958869e-07, |
| "loss": 0.0039, |
| "num_input_tokens_seen": 1339552, |
| "step": 10380 |
| }, |
| { |
| "epoch": 9.696545284780578, |
| "grad_norm": 8.740482330322266, |
| "learning_rate": 1.4098482916705126e-07, |
| "loss": 0.0157, |
| "num_input_tokens_seen": 1340224, |
| "step": 10385 |
| }, |
| { |
| "epoch": 9.701213818860877, |
| "grad_norm": 3.24934458732605, |
| "learning_rate": 1.3669722122070516e-07, |
| "loss": 0.1227, |
| "num_input_tokens_seen": 1340768, |
| "step": 10390 |
| }, |
| { |
| "epoch": 9.705882352941176, |
| "grad_norm": 0.5039592385292053, |
| "learning_rate": 1.324756422433654e-07, |
| "loss": 0.0288, |
| "num_input_tokens_seen": 1341456, |
| "step": 10395 |
| }, |
| { |
| "epoch": 9.710550887021475, |
| "grad_norm": 5.067893981933594, |
| "learning_rate": 1.283201034461917e-07, |
| "loss": 0.0439, |
| "num_input_tokens_seen": 1342096, |
| "step": 10400 |
| }, |
| { |
| "epoch": 9.715219421101773, |
| "grad_norm": 0.04172622784972191, |
| "learning_rate": 1.2423061586496477e-07, |
| "loss": 0.0044, |
| "num_input_tokens_seen": 1342720, |
| "step": 10405 |
| }, |
| { |
| "epoch": 9.719887955182072, |
| "grad_norm": 0.34393176436424255, |
| "learning_rate": 1.2020719036005545e-07, |
| "loss": 0.0047, |
| "num_input_tokens_seen": 1343360, |
| "step": 10410 |
| }, |
| { |
| "epoch": 9.72455648926237, |
| "grad_norm": 1.6086621284484863, |
| "learning_rate": 1.1624983761639174e-07, |
| "loss": 0.0068, |
| "num_input_tokens_seen": 1344000, |
| "step": 10415 |
| }, |
| { |
| "epoch": 9.72922502334267, |
| "grad_norm": 0.15175510942935944, |
| "learning_rate": 1.1235856814343914e-07, |
| "loss": 0.0133, |
| "num_input_tokens_seen": 1344688, |
| "step": 10420 |
| }, |
| { |
| "epoch": 9.733893557422968, |
| "grad_norm": 0.01866198517382145, |
| "learning_rate": 1.0853339227515635e-07, |
| "loss": 0.086, |
| "num_input_tokens_seen": 1345392, |
| "step": 10425 |
| }, |
| { |
| "epoch": 9.738562091503269, |
| "grad_norm": 0.1492108702659607, |
| "learning_rate": 1.0477432016998967e-07, |
| "loss": 0.02, |
| "num_input_tokens_seen": 1346032, |
| "step": 10430 |
| }, |
| { |
| "epoch": 9.743230625583568, |
| "grad_norm": 2.7389090061187744, |
| "learning_rate": 1.0108136181082862e-07, |
| "loss": 0.0313, |
| "num_input_tokens_seen": 1346624, |
| "step": 10435 |
| }, |
| { |
| "epoch": 9.747899159663866, |
| "grad_norm": 0.3672071397304535, |
| "learning_rate": 9.745452700498925e-08, |
| "loss": 0.0049, |
| "num_input_tokens_seen": 1347248, |
| "step": 10440 |
| }, |
| { |
| "epoch": 9.752567693744165, |
| "grad_norm": 0.11658532172441483, |
| "learning_rate": 9.38938253841809e-08, |
| "loss": 0.0028, |
| "num_input_tokens_seen": 1347856, |
| "step": 10445 |
| }, |
| { |
| "epoch": 9.757236227824464, |
| "grad_norm": 0.11300468444824219, |
| "learning_rate": 9.039926640449226e-08, |
| "loss": 0.0454, |
| "num_input_tokens_seen": 1348480, |
| "step": 10450 |
| }, |
| { |
| "epoch": 9.761904761904763, |
| "grad_norm": 0.8278319239616394, |
| "learning_rate": 8.697085934634696e-08, |
| "loss": 0.0477, |
| "num_input_tokens_seen": 1349216, |
| "step": 10455 |
| }, |
| { |
| "epoch": 9.766573295985062, |
| "grad_norm": 0.8841646313667297, |
| "learning_rate": 8.36086133144981e-08, |
| "loss": 0.0395, |
| "num_input_tokens_seen": 1349856, |
| "step": 10460 |
| }, |
| { |
| "epoch": 9.77124183006536, |
| "grad_norm": 0.09983119368553162, |
| "learning_rate": 8.03125372379948e-08, |
| "loss": 0.0047, |
| "num_input_tokens_seen": 1350480, |
| "step": 10465 |
| }, |
| { |
| "epoch": 9.775910364145659, |
| "grad_norm": 0.1856643557548523, |
| "learning_rate": 7.70826398701574e-08, |
| "loss": 0.0027, |
| "num_input_tokens_seen": 1351152, |
| "step": 10470 |
| }, |
| { |
| "epoch": 9.780578898225958, |
| "grad_norm": 0.3595370352268219, |
| "learning_rate": 7.391892978856341e-08, |
| "loss": 0.0141, |
| "num_input_tokens_seen": 1351856, |
| "step": 10475 |
| }, |
| { |
| "epoch": 9.785247432306257, |
| "grad_norm": 0.802082896232605, |
| "learning_rate": 7.082141539500597e-08, |
| "loss": 0.032, |
| "num_input_tokens_seen": 1352560, |
| "step": 10480 |
| }, |
| { |
| "epoch": 9.789915966386555, |
| "grad_norm": 0.0799412950873375, |
| "learning_rate": 6.779010491549942e-08, |
| "loss": 0.0188, |
| "num_input_tokens_seen": 1353200, |
| "step": 10485 |
| }, |
| { |
| "epoch": 9.794584500466854, |
| "grad_norm": 0.3375213146209717, |
| "learning_rate": 6.482500640022926e-08, |
| "loss": 0.0975, |
| "num_input_tokens_seen": 1353792, |
| "step": 10490 |
| }, |
| { |
| "epoch": 9.799253034547153, |
| "grad_norm": 2.9539597034454346, |
| "learning_rate": 6.192612772354945e-08, |
| "loss": 0.0147, |
| "num_input_tokens_seen": 1354464, |
| "step": 10495 |
| }, |
| { |
| "epoch": 9.803921568627452, |
| "grad_norm": 0.9493117332458496, |
| "learning_rate": 5.909347658394904e-08, |
| "loss": 0.083, |
| "num_input_tokens_seen": 1355184, |
| "step": 10500 |
| }, |
| { |
| "epoch": 9.80859010270775, |
| "grad_norm": 0.3258625566959381, |
| "learning_rate": 5.632706050404668e-08, |
| "loss": 0.0032, |
| "num_input_tokens_seen": 1355856, |
| "step": 10505 |
| }, |
| { |
| "epoch": 9.81325863678805, |
| "grad_norm": 4.085659503936768, |
| "learning_rate": 5.3626886830557274e-08, |
| "loss": 0.0413, |
| "num_input_tokens_seen": 1356592, |
| "step": 10510 |
| }, |
| { |
| "epoch": 9.817927170868348, |
| "grad_norm": 0.2234000712633133, |
| "learning_rate": 5.099296273427534e-08, |
| "loss": 0.0795, |
| "num_input_tokens_seen": 1357120, |
| "step": 10515 |
| }, |
| { |
| "epoch": 9.822595704948647, |
| "grad_norm": 2.338752031326294, |
| "learning_rate": 4.8425295210058344e-08, |
| "loss": 0.0692, |
| "num_input_tokens_seen": 1357728, |
| "step": 10520 |
| }, |
| { |
| "epoch": 9.827264239028946, |
| "grad_norm": 0.4640015959739685, |
| "learning_rate": 4.592389107681283e-08, |
| "loss": 0.0085, |
| "num_input_tokens_seen": 1358496, |
| "step": 10525 |
| }, |
| { |
| "epoch": 9.831932773109244, |
| "grad_norm": 0.17430801689624786, |
| "learning_rate": 4.3488756977463906e-08, |
| "loss": 0.0424, |
| "num_input_tokens_seen": 1359072, |
| "step": 10530 |
| }, |
| { |
| "epoch": 9.836601307189543, |
| "grad_norm": 9.863551139831543, |
| "learning_rate": 4.111989937894967e-08, |
| "loss": 0.0857, |
| "num_input_tokens_seen": 1359744, |
| "step": 10535 |
| }, |
| { |
| "epoch": 9.841269841269842, |
| "grad_norm": 0.20236068964004517, |
| "learning_rate": 3.881732457219622e-08, |
| "loss": 0.0136, |
| "num_input_tokens_seen": 1360400, |
| "step": 10540 |
| }, |
| { |
| "epoch": 9.84593837535014, |
| "grad_norm": 2.3232908248901367, |
| "learning_rate": 3.65810386721066e-08, |
| "loss": 0.0933, |
| "num_input_tokens_seen": 1361040, |
| "step": 10545 |
| }, |
| { |
| "epoch": 9.85060690943044, |
| "grad_norm": 0.05601672828197479, |
| "learning_rate": 3.441104761753578e-08, |
| "loss": 0.0633, |
| "num_input_tokens_seen": 1361712, |
| "step": 10550 |
| }, |
| { |
| "epoch": 9.855275443510738, |
| "grad_norm": 0.11657658964395523, |
| "learning_rate": 3.230735717129063e-08, |
| "loss": 0.0287, |
| "num_input_tokens_seen": 1362304, |
| "step": 10555 |
| }, |
| { |
| "epoch": 9.859943977591037, |
| "grad_norm": 3.281459093093872, |
| "learning_rate": 3.026997292009392e-08, |
| "loss": 0.0783, |
| "num_input_tokens_seen": 1362912, |
| "step": 10560 |
| }, |
| { |
| "epoch": 9.864612511671336, |
| "grad_norm": 0.4836415946483612, |
| "learning_rate": 2.8298900274589813e-08, |
| "loss": 0.0312, |
| "num_input_tokens_seen": 1363552, |
| "step": 10565 |
| }, |
| { |
| "epoch": 9.869281045751634, |
| "grad_norm": 2.9066414833068848, |
| "learning_rate": 2.6394144469310543e-08, |
| "loss": 0.0595, |
| "num_input_tokens_seen": 1364240, |
| "step": 10570 |
| }, |
| { |
| "epoch": 9.873949579831933, |
| "grad_norm": 1.6726226806640625, |
| "learning_rate": 2.4555710562684796e-08, |
| "loss": 0.0522, |
| "num_input_tokens_seen": 1364864, |
| "step": 10575 |
| }, |
| { |
| "epoch": 9.878618113912232, |
| "grad_norm": 0.22271433472633362, |
| "learning_rate": 2.2783603436998813e-08, |
| "loss": 0.0326, |
| "num_input_tokens_seen": 1365488, |
| "step": 10580 |
| }, |
| { |
| "epoch": 9.88328664799253, |
| "grad_norm": 0.06830593198537827, |
| "learning_rate": 2.1077827798404726e-08, |
| "loss": 0.0053, |
| "num_input_tokens_seen": 1366192, |
| "step": 10585 |
| }, |
| { |
| "epoch": 9.88795518207283, |
| "grad_norm": 1.006521463394165, |
| "learning_rate": 1.943838817689281e-08, |
| "loss": 0.0255, |
| "num_input_tokens_seen": 1366928, |
| "step": 10590 |
| }, |
| { |
| "epoch": 9.892623716153128, |
| "grad_norm": 0.22948722541332245, |
| "learning_rate": 1.786528892629147e-08, |
| "loss": 0.0561, |
| "num_input_tokens_seen": 1367648, |
| "step": 10595 |
| }, |
| { |
| "epoch": 9.897292250233427, |
| "grad_norm": 0.5100280046463013, |
| "learning_rate": 1.6358534224250598e-08, |
| "loss": 0.1655, |
| "num_input_tokens_seen": 1368304, |
| "step": 10600 |
| }, |
| { |
| "epoch": 9.901960784313726, |
| "grad_norm": 0.16577644646167755, |
| "learning_rate": 1.4918128072224924e-08, |
| "loss": 0.0217, |
| "num_input_tokens_seen": 1368928, |
| "step": 10605 |
| }, |
| { |
| "epoch": 9.906629318394025, |
| "grad_norm": 1.0775461196899414, |
| "learning_rate": 1.3544074295473996e-08, |
| "loss": 0.0552, |
| "num_input_tokens_seen": 1369584, |
| "step": 10610 |
| }, |
| { |
| "epoch": 9.911297852474323, |
| "grad_norm": 0.1106322854757309, |
| "learning_rate": 1.2236376543042772e-08, |
| "loss": 0.0232, |
| "num_input_tokens_seen": 1370208, |
| "step": 10615 |
| }, |
| { |
| "epoch": 9.915966386554622, |
| "grad_norm": 1.7885626554489136, |
| "learning_rate": 1.099503828775883e-08, |
| "loss": 0.0076, |
| "num_input_tokens_seen": 1370896, |
| "step": 10620 |
| }, |
| { |
| "epoch": 9.920634920634921, |
| "grad_norm": 3.9994475841522217, |
| "learning_rate": 9.820062826218502e-09, |
| "loss": 0.0439, |
| "num_input_tokens_seen": 1371536, |
| "step": 10625 |
| }, |
| { |
| "epoch": 9.92530345471522, |
| "grad_norm": 0.17582900822162628, |
| "learning_rate": 8.711453278778536e-09, |
| "loss": 0.0037, |
| "num_input_tokens_seen": 1372144, |
| "step": 10630 |
| }, |
| { |
| "epoch": 9.929971988795518, |
| "grad_norm": 0.2206035703420639, |
| "learning_rate": 7.669212589556108e-09, |
| "loss": 0.0134, |
| "num_input_tokens_seen": 1372768, |
| "step": 10635 |
| }, |
| { |
| "epoch": 9.934640522875817, |
| "grad_norm": 0.08980569988489151, |
| "learning_rate": 6.693343526403828e-09, |
| "loss": 0.0234, |
| "num_input_tokens_seen": 1373472, |
| "step": 10640 |
| }, |
| { |
| "epoch": 9.939309056956116, |
| "grad_norm": 0.33472374081611633, |
| "learning_rate": 5.78384868091808e-09, |
| "loss": 0.0795, |
| "num_input_tokens_seen": 1374080, |
| "step": 10645 |
| }, |
| { |
| "epoch": 9.943977591036415, |
| "grad_norm": 0.08384134620428085, |
| "learning_rate": 4.940730468427912e-09, |
| "loss": 0.02, |
| "num_input_tokens_seen": 1374736, |
| "step": 10650 |
| }, |
| { |
| "epoch": 9.948646125116714, |
| "grad_norm": 0.3119681477546692, |
| "learning_rate": 4.163991127983935e-09, |
| "loss": 0.0369, |
| "num_input_tokens_seen": 1375456, |
| "step": 10655 |
| }, |
| { |
| "epoch": 9.953314659197012, |
| "grad_norm": 0.5931786894798279, |
| "learning_rate": 3.453632722358324e-09, |
| "loss": 0.023, |
| "num_input_tokens_seen": 1376032, |
| "step": 10660 |
| }, |
| { |
| "epoch": 9.957983193277311, |
| "grad_norm": 3.6127638816833496, |
| "learning_rate": 2.8096571380309413e-09, |
| "loss": 0.0376, |
| "num_input_tokens_seen": 1376720, |
| "step": 10665 |
| }, |
| { |
| "epoch": 9.96265172735761, |
| "grad_norm": 0.20840921998023987, |
| "learning_rate": 2.232066085200435e-09, |
| "loss": 0.0043, |
| "num_input_tokens_seen": 1377392, |
| "step": 10670 |
| }, |
| { |
| "epoch": 9.967320261437909, |
| "grad_norm": 0.34955111145973206, |
| "learning_rate": 1.7208610977620388e-09, |
| "loss": 0.0231, |
| "num_input_tokens_seen": 1378096, |
| "step": 10675 |
| }, |
| { |
| "epoch": 9.971988795518207, |
| "grad_norm": 6.421323776245117, |
| "learning_rate": 1.2760435333103448e-09, |
| "loss": 0.0984, |
| "num_input_tokens_seen": 1378800, |
| "step": 10680 |
| }, |
| { |
| "epoch": 9.976657329598506, |
| "grad_norm": 5.146061420440674, |
| "learning_rate": 8.976145731393049e-10, |
| "loss": 0.0277, |
| "num_input_tokens_seen": 1379520, |
| "step": 10685 |
| }, |
| { |
| "epoch": 9.981325863678805, |
| "grad_norm": 0.22845172882080078, |
| "learning_rate": 5.855752222366783e-10, |
| "loss": 0.0465, |
| "num_input_tokens_seen": 1380032, |
| "step": 10690 |
| }, |
| { |
| "epoch": 9.985994397759104, |
| "grad_norm": 1.768675684928894, |
| "learning_rate": 3.3992630927848213e-10, |
| "loss": 0.0314, |
| "num_input_tokens_seen": 1380720, |
| "step": 10695 |
| }, |
| { |
| "epoch": 9.990662931839402, |
| "grad_norm": 0.8772925734519958, |
| "learning_rate": 1.6066848662621426e-10, |
| "loss": 0.0195, |
| "num_input_tokens_seen": 1381328, |
| "step": 10700 |
| }, |
| { |
| "epoch": 9.995331465919701, |
| "grad_norm": 0.02560342662036419, |
| "learning_rate": 4.780223033795661e-11, |
| "loss": 0.0029, |
| "num_input_tokens_seen": 1382080, |
| "step": 10705 |
| }, |
| { |
| "epoch": 10.0, |
| "grad_norm": 0.5053896903991699, |
| "learning_rate": 1.3278401433947096e-12, |
| "loss": 0.0035, |
| "num_input_tokens_seen": 1382584, |
| "step": 10710 |
| }, |
| { |
| "epoch": 10.0, |
| "num_input_tokens_seen": 1382584, |
| "step": 10710, |
| "total_flos": 6.225713263627469e+16, |
| "train_loss": 0.4241082760738426, |
| "train_runtime": 967.9016, |
| "train_samples_per_second": 22.12, |
| "train_steps_per_second": 11.065 |
| } |
| ], |
| "logging_steps": 5, |
| "max_steps": 10710, |
| "num_input_tokens_seen": 1382584, |
| "num_train_epochs": 10, |
| "save_steps": 536, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 6.225713263627469e+16, |
| "train_batch_size": 2, |
| "trial_name": null, |
| "trial_params": null |
| } |