diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,54406 @@ +{ + "best_global_step": 6784, + "best_metric": 0.5017365217208862, + "best_model_checkpoint": "saves_multiple/prefix-tuning/llama-3-8b-instruct/train_codealpacapy_123_1762496345/checkpoint-6784", + "epoch": 20.0, + "eval_steps": 3392, + "global_step": 33920, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00294811320754717, + "grad_norm": 64.7282485961914, + "learning_rate": 1.179245283018868e-08, + "loss": 6.5898, + "num_input_tokens_seen": 2880, + "step": 5 + }, + { + "epoch": 0.00589622641509434, + "grad_norm": 49.12639236450195, + "learning_rate": 2.6533018867924528e-08, + "loss": 6.6231, + "num_input_tokens_seen": 5728, + "step": 10 + }, + { + "epoch": 0.00884433962264151, + "grad_norm": 51.32429885864258, + "learning_rate": 4.127358490566038e-08, + "loss": 6.5842, + "num_input_tokens_seen": 8736, + "step": 15 + }, + { + "epoch": 0.01179245283018868, + "grad_norm": 60.82346725463867, + "learning_rate": 5.601415094339623e-08, + "loss": 6.5648, + "num_input_tokens_seen": 11648, + "step": 20 + }, + { + "epoch": 0.01474056603773585, + "grad_norm": 37.56840133666992, + "learning_rate": 7.075471698113208e-08, + "loss": 6.4989, + "num_input_tokens_seen": 16224, + "step": 25 + }, + { + "epoch": 0.01768867924528302, + "grad_norm": 61.440269470214844, + "learning_rate": 8.549528301886793e-08, + "loss": 6.7535, + "num_input_tokens_seen": 19136, + "step": 30 + }, + { + "epoch": 0.020636792452830188, + "grad_norm": 58.689029693603516, + "learning_rate": 1.0023584905660378e-07, + "loss": 6.3632, + "num_input_tokens_seen": 22208, + "step": 35 + }, + { + "epoch": 0.02358490566037736, + "grad_norm": 68.44844055175781, + "learning_rate": 1.1497641509433962e-07, + "loss": 6.7811, + "num_input_tokens_seen": 24576, + "step": 40 + }, + { + "epoch": 0.02653301886792453, + "grad_norm": 55.26619338989258, + "learning_rate": 1.297169811320755e-07, + "loss": 6.83, + "num_input_tokens_seen": 28800, + "step": 45 + }, + { + "epoch": 0.0294811320754717, + "grad_norm": 40.49510955810547, + "learning_rate": 1.4445754716981135e-07, + "loss": 6.4312, + "num_input_tokens_seen": 31776, + "step": 50 + }, + { + "epoch": 0.03242924528301887, + "grad_norm": 41.20639419555664, + "learning_rate": 1.591981132075472e-07, + "loss": 6.1927, + "num_input_tokens_seen": 35648, + "step": 55 + }, + { + "epoch": 0.03537735849056604, + "grad_norm": 59.88456726074219, + "learning_rate": 1.7393867924528304e-07, + "loss": 6.5442, + "num_input_tokens_seen": 38624, + "step": 60 + }, + { + "epoch": 0.038325471698113206, + "grad_norm": 55.380611419677734, + "learning_rate": 1.886792452830189e-07, + "loss": 6.2018, + "num_input_tokens_seen": 41728, + "step": 65 + }, + { + "epoch": 0.041273584905660375, + "grad_norm": 42.346500396728516, + "learning_rate": 2.0341981132075473e-07, + "loss": 6.0513, + "num_input_tokens_seen": 44672, + "step": 70 + }, + { + "epoch": 0.044221698113207544, + "grad_norm": 40.66829299926758, + "learning_rate": 2.1816037735849058e-07, + "loss": 6.3024, + "num_input_tokens_seen": 48160, + "step": 75 + }, + { + "epoch": 0.04716981132075472, + "grad_norm": 50.525733947753906, + "learning_rate": 2.3290094339622643e-07, + "loss": 6.0573, + "num_input_tokens_seen": 51776, + "step": 80 + }, + { + "epoch": 0.05011792452830189, + "grad_norm": 52.028682708740234, + "learning_rate": 2.476415094339623e-07, + "loss": 6.4386, + "num_input_tokens_seen": 54528, + "step": 85 + }, + { + "epoch": 0.05306603773584906, + "grad_norm": 42.27737045288086, + "learning_rate": 2.6238207547169815e-07, + "loss": 6.3051, + "num_input_tokens_seen": 57568, + "step": 90 + }, + { + "epoch": 0.05601415094339623, + "grad_norm": 52.775020599365234, + "learning_rate": 2.7712264150943397e-07, + "loss": 5.7796, + "num_input_tokens_seen": 60256, + "step": 95 + }, + { + "epoch": 0.0589622641509434, + "grad_norm": 68.52125549316406, + "learning_rate": 2.9186320754716984e-07, + "loss": 5.4755, + "num_input_tokens_seen": 62688, + "step": 100 + }, + { + "epoch": 0.061910377358490566, + "grad_norm": 51.297088623046875, + "learning_rate": 3.0660377358490567e-07, + "loss": 5.6559, + "num_input_tokens_seen": 65344, + "step": 105 + }, + { + "epoch": 0.06485849056603774, + "grad_norm": 34.314109802246094, + "learning_rate": 3.213443396226416e-07, + "loss": 5.1255, + "num_input_tokens_seen": 68544, + "step": 110 + }, + { + "epoch": 0.06780660377358491, + "grad_norm": 34.0240592956543, + "learning_rate": 3.3608490566037736e-07, + "loss": 5.5851, + "num_input_tokens_seen": 71328, + "step": 115 + }, + { + "epoch": 0.07075471698113207, + "grad_norm": 33.414390563964844, + "learning_rate": 3.508254716981133e-07, + "loss": 5.1278, + "num_input_tokens_seen": 74240, + "step": 120 + }, + { + "epoch": 0.07370283018867925, + "grad_norm": 46.9061393737793, + "learning_rate": 3.6556603773584905e-07, + "loss": 5.1234, + "num_input_tokens_seen": 77248, + "step": 125 + }, + { + "epoch": 0.07665094339622641, + "grad_norm": 29.65032386779785, + "learning_rate": 3.80306603773585e-07, + "loss": 5.103, + "num_input_tokens_seen": 80704, + "step": 130 + }, + { + "epoch": 0.07959905660377359, + "grad_norm": 38.87522888183594, + "learning_rate": 3.9504716981132075e-07, + "loss": 4.762, + "num_input_tokens_seen": 83840, + "step": 135 + }, + { + "epoch": 0.08254716981132075, + "grad_norm": 41.98786926269531, + "learning_rate": 4.097877358490567e-07, + "loss": 5.2388, + "num_input_tokens_seen": 86560, + "step": 140 + }, + { + "epoch": 0.08549528301886793, + "grad_norm": 38.62727355957031, + "learning_rate": 4.2452830188679244e-07, + "loss": 5.1523, + "num_input_tokens_seen": 89376, + "step": 145 + }, + { + "epoch": 0.08844339622641509, + "grad_norm": 35.342803955078125, + "learning_rate": 4.3926886792452837e-07, + "loss": 4.4728, + "num_input_tokens_seen": 95904, + "step": 150 + }, + { + "epoch": 0.09139150943396226, + "grad_norm": 32.85224914550781, + "learning_rate": 4.5400943396226414e-07, + "loss": 4.9182, + "num_input_tokens_seen": 99520, + "step": 155 + }, + { + "epoch": 0.09433962264150944, + "grad_norm": 31.7283878326416, + "learning_rate": 4.6875000000000006e-07, + "loss": 4.8899, + "num_input_tokens_seen": 102656, + "step": 160 + }, + { + "epoch": 0.0972877358490566, + "grad_norm": 31.581472396850586, + "learning_rate": 4.834905660377359e-07, + "loss": 4.7307, + "num_input_tokens_seen": 105312, + "step": 165 + }, + { + "epoch": 0.10023584905660378, + "grad_norm": 27.684341430664062, + "learning_rate": 4.982311320754717e-07, + "loss": 4.9404, + "num_input_tokens_seen": 108160, + "step": 170 + }, + { + "epoch": 0.10318396226415094, + "grad_norm": 30.44116973876953, + "learning_rate": 5.129716981132076e-07, + "loss": 4.2148, + "num_input_tokens_seen": 111680, + "step": 175 + }, + { + "epoch": 0.10613207547169812, + "grad_norm": 28.67049789428711, + "learning_rate": 5.277122641509435e-07, + "loss": 4.8944, + "num_input_tokens_seen": 114912, + "step": 180 + }, + { + "epoch": 0.10908018867924528, + "grad_norm": 30.86630630493164, + "learning_rate": 5.424528301886793e-07, + "loss": 4.2355, + "num_input_tokens_seen": 117376, + "step": 185 + }, + { + "epoch": 0.11202830188679246, + "grad_norm": 38.3621940612793, + "learning_rate": 5.571933962264151e-07, + "loss": 4.059, + "num_input_tokens_seen": 120416, + "step": 190 + }, + { + "epoch": 0.11497641509433962, + "grad_norm": 25.196552276611328, + "learning_rate": 5.71933962264151e-07, + "loss": 4.4158, + "num_input_tokens_seen": 123424, + "step": 195 + }, + { + "epoch": 0.1179245283018868, + "grad_norm": 23.164745330810547, + "learning_rate": 5.866745283018868e-07, + "loss": 4.3497, + "num_input_tokens_seen": 127136, + "step": 200 + }, + { + "epoch": 0.12087264150943396, + "grad_norm": 25.149311065673828, + "learning_rate": 6.014150943396227e-07, + "loss": 4.2567, + "num_input_tokens_seen": 131840, + "step": 205 + }, + { + "epoch": 0.12382075471698113, + "grad_norm": 21.28417205810547, + "learning_rate": 6.161556603773585e-07, + "loss": 3.9834, + "num_input_tokens_seen": 135360, + "step": 210 + }, + { + "epoch": 0.1267688679245283, + "grad_norm": 29.37235450744629, + "learning_rate": 6.308962264150945e-07, + "loss": 4.1029, + "num_input_tokens_seen": 139328, + "step": 215 + }, + { + "epoch": 0.12971698113207547, + "grad_norm": 28.774032592773438, + "learning_rate": 6.456367924528302e-07, + "loss": 4.156, + "num_input_tokens_seen": 142976, + "step": 220 + }, + { + "epoch": 0.13266509433962265, + "grad_norm": 23.721956253051758, + "learning_rate": 6.603773584905661e-07, + "loss": 3.4806, + "num_input_tokens_seen": 146528, + "step": 225 + }, + { + "epoch": 0.13561320754716982, + "grad_norm": 19.850208282470703, + "learning_rate": 6.75117924528302e-07, + "loss": 3.7368, + "num_input_tokens_seen": 152192, + "step": 230 + }, + { + "epoch": 0.13856132075471697, + "grad_norm": 34.27486801147461, + "learning_rate": 6.898584905660379e-07, + "loss": 4.0799, + "num_input_tokens_seen": 154624, + "step": 235 + }, + { + "epoch": 0.14150943396226415, + "grad_norm": 19.940319061279297, + "learning_rate": 7.045990566037736e-07, + "loss": 4.2881, + "num_input_tokens_seen": 157920, + "step": 240 + }, + { + "epoch": 0.14445754716981132, + "grad_norm": 25.46551513671875, + "learning_rate": 7.193396226415095e-07, + "loss": 3.5618, + "num_input_tokens_seen": 160640, + "step": 245 + }, + { + "epoch": 0.1474056603773585, + "grad_norm": 27.28227996826172, + "learning_rate": 7.340801886792454e-07, + "loss": 3.6146, + "num_input_tokens_seen": 164448, + "step": 250 + }, + { + "epoch": 0.15035377358490565, + "grad_norm": 33.54536056518555, + "learning_rate": 7.488207547169812e-07, + "loss": 3.4774, + "num_input_tokens_seen": 168192, + "step": 255 + }, + { + "epoch": 0.15330188679245282, + "grad_norm": 30.10123634338379, + "learning_rate": 7.63561320754717e-07, + "loss": 3.5721, + "num_input_tokens_seen": 172192, + "step": 260 + }, + { + "epoch": 0.15625, + "grad_norm": 24.84919548034668, + "learning_rate": 7.783018867924529e-07, + "loss": 3.4567, + "num_input_tokens_seen": 175008, + "step": 265 + }, + { + "epoch": 0.15919811320754718, + "grad_norm": 24.56399917602539, + "learning_rate": 7.930424528301888e-07, + "loss": 3.2578, + "num_input_tokens_seen": 177888, + "step": 270 + }, + { + "epoch": 0.16214622641509435, + "grad_norm": 22.678579330444336, + "learning_rate": 8.077830188679246e-07, + "loss": 3.0722, + "num_input_tokens_seen": 181472, + "step": 275 + }, + { + "epoch": 0.1650943396226415, + "grad_norm": 22.689022064208984, + "learning_rate": 8.225235849056605e-07, + "loss": 3.1394, + "num_input_tokens_seen": 187296, + "step": 280 + }, + { + "epoch": 0.16804245283018868, + "grad_norm": 26.912845611572266, + "learning_rate": 8.372641509433963e-07, + "loss": 3.2233, + "num_input_tokens_seen": 189600, + "step": 285 + }, + { + "epoch": 0.17099056603773585, + "grad_norm": 19.499067306518555, + "learning_rate": 8.520047169811321e-07, + "loss": 3.3117, + "num_input_tokens_seen": 192256, + "step": 290 + }, + { + "epoch": 0.17393867924528303, + "grad_norm": 18.910808563232422, + "learning_rate": 8.66745283018868e-07, + "loss": 3.458, + "num_input_tokens_seen": 195168, + "step": 295 + }, + { + "epoch": 0.17688679245283018, + "grad_norm": 19.73526954650879, + "learning_rate": 8.814858490566039e-07, + "loss": 3.4695, + "num_input_tokens_seen": 198368, + "step": 300 + }, + { + "epoch": 0.17983490566037735, + "grad_norm": 27.31045913696289, + "learning_rate": 8.962264150943397e-07, + "loss": 3.3337, + "num_input_tokens_seen": 201184, + "step": 305 + }, + { + "epoch": 0.18278301886792453, + "grad_norm": 17.952632904052734, + "learning_rate": 9.109669811320755e-07, + "loss": 3.3536, + "num_input_tokens_seen": 204000, + "step": 310 + }, + { + "epoch": 0.1857311320754717, + "grad_norm": 36.304290771484375, + "learning_rate": 9.257075471698114e-07, + "loss": 3.1915, + "num_input_tokens_seen": 207136, + "step": 315 + }, + { + "epoch": 0.18867924528301888, + "grad_norm": 24.410093307495117, + "learning_rate": 9.404481132075473e-07, + "loss": 2.6377, + "num_input_tokens_seen": 210112, + "step": 320 + }, + { + "epoch": 0.19162735849056603, + "grad_norm": 20.085615158081055, + "learning_rate": 9.551886792452833e-07, + "loss": 2.676, + "num_input_tokens_seen": 212736, + "step": 325 + }, + { + "epoch": 0.1945754716981132, + "grad_norm": 24.513856887817383, + "learning_rate": 9.699292452830188e-07, + "loss": 2.634, + "num_input_tokens_seen": 215584, + "step": 330 + }, + { + "epoch": 0.19752358490566038, + "grad_norm": 17.04359245300293, + "learning_rate": 9.846698113207548e-07, + "loss": 3.4525, + "num_input_tokens_seen": 218912, + "step": 335 + }, + { + "epoch": 0.20047169811320756, + "grad_norm": 16.813438415527344, + "learning_rate": 9.994103773584906e-07, + "loss": 2.8842, + "num_input_tokens_seen": 222432, + "step": 340 + }, + { + "epoch": 0.2034198113207547, + "grad_norm": 18.97490119934082, + "learning_rate": 1.0141509433962265e-06, + "loss": 2.9732, + "num_input_tokens_seen": 226240, + "step": 345 + }, + { + "epoch": 0.20636792452830188, + "grad_norm": 18.00664710998535, + "learning_rate": 1.0288915094339623e-06, + "loss": 3.066, + "num_input_tokens_seen": 229888, + "step": 350 + }, + { + "epoch": 0.20931603773584906, + "grad_norm": 43.31511688232422, + "learning_rate": 1.043632075471698e-06, + "loss": 2.8984, + "num_input_tokens_seen": 234176, + "step": 355 + }, + { + "epoch": 0.21226415094339623, + "grad_norm": 19.27219009399414, + "learning_rate": 1.058372641509434e-06, + "loss": 3.0836, + "num_input_tokens_seen": 237696, + "step": 360 + }, + { + "epoch": 0.21521226415094338, + "grad_norm": 16.996538162231445, + "learning_rate": 1.07311320754717e-06, + "loss": 2.8502, + "num_input_tokens_seen": 241568, + "step": 365 + }, + { + "epoch": 0.21816037735849056, + "grad_norm": 19.156700134277344, + "learning_rate": 1.0878537735849056e-06, + "loss": 2.7408, + "num_input_tokens_seen": 243968, + "step": 370 + }, + { + "epoch": 0.22110849056603774, + "grad_norm": 24.887527465820312, + "learning_rate": 1.1025943396226416e-06, + "loss": 2.5705, + "num_input_tokens_seen": 246656, + "step": 375 + }, + { + "epoch": 0.2240566037735849, + "grad_norm": 22.936687469482422, + "learning_rate": 1.1173349056603773e-06, + "loss": 2.7849, + "num_input_tokens_seen": 250528, + "step": 380 + }, + { + "epoch": 0.2270047169811321, + "grad_norm": 21.138553619384766, + "learning_rate": 1.1320754716981133e-06, + "loss": 2.6605, + "num_input_tokens_seen": 253280, + "step": 385 + }, + { + "epoch": 0.22995283018867924, + "grad_norm": 29.792118072509766, + "learning_rate": 1.1468160377358493e-06, + "loss": 2.659, + "num_input_tokens_seen": 256448, + "step": 390 + }, + { + "epoch": 0.2329009433962264, + "grad_norm": 24.42670249938965, + "learning_rate": 1.1615566037735849e-06, + "loss": 2.8954, + "num_input_tokens_seen": 259392, + "step": 395 + }, + { + "epoch": 0.2358490566037736, + "grad_norm": 14.09592056274414, + "learning_rate": 1.1762971698113208e-06, + "loss": 2.6005, + "num_input_tokens_seen": 262432, + "step": 400 + }, + { + "epoch": 0.23879716981132076, + "grad_norm": 18.65346908569336, + "learning_rate": 1.1910377358490568e-06, + "loss": 2.4863, + "num_input_tokens_seen": 265088, + "step": 405 + }, + { + "epoch": 0.2417452830188679, + "grad_norm": 21.989351272583008, + "learning_rate": 1.2057783018867926e-06, + "loss": 2.882, + "num_input_tokens_seen": 268480, + "step": 410 + }, + { + "epoch": 0.2446933962264151, + "grad_norm": 13.782401084899902, + "learning_rate": 1.2205188679245284e-06, + "loss": 2.4369, + "num_input_tokens_seen": 272448, + "step": 415 + }, + { + "epoch": 0.24764150943396226, + "grad_norm": 15.481195449829102, + "learning_rate": 1.2352594339622641e-06, + "loss": 2.3918, + "num_input_tokens_seen": 275168, + "step": 420 + }, + { + "epoch": 0.2505896226415094, + "grad_norm": 19.427631378173828, + "learning_rate": 1.25e-06, + "loss": 2.3237, + "num_input_tokens_seen": 277888, + "step": 425 + }, + { + "epoch": 0.2535377358490566, + "grad_norm": 27.305288314819336, + "learning_rate": 1.264740566037736e-06, + "loss": 2.6138, + "num_input_tokens_seen": 281280, + "step": 430 + }, + { + "epoch": 0.25648584905660377, + "grad_norm": 31.20403289794922, + "learning_rate": 1.2794811320754718e-06, + "loss": 2.642, + "num_input_tokens_seen": 283904, + "step": 435 + }, + { + "epoch": 0.25943396226415094, + "grad_norm": 25.38358497619629, + "learning_rate": 1.2942216981132078e-06, + "loss": 2.586, + "num_input_tokens_seen": 286528, + "step": 440 + }, + { + "epoch": 0.2623820754716981, + "grad_norm": 18.536182403564453, + "learning_rate": 1.3089622641509436e-06, + "loss": 2.555, + "num_input_tokens_seen": 290048, + "step": 445 + }, + { + "epoch": 0.2653301886792453, + "grad_norm": 17.035594940185547, + "learning_rate": 1.3237028301886792e-06, + "loss": 2.4563, + "num_input_tokens_seen": 294080, + "step": 450 + }, + { + "epoch": 0.26827830188679247, + "grad_norm": 28.207172393798828, + "learning_rate": 1.3384433962264151e-06, + "loss": 2.7682, + "num_input_tokens_seen": 296608, + "step": 455 + }, + { + "epoch": 0.27122641509433965, + "grad_norm": 18.966447830200195, + "learning_rate": 1.353183962264151e-06, + "loss": 2.4682, + "num_input_tokens_seen": 299136, + "step": 460 + }, + { + "epoch": 0.27417452830188677, + "grad_norm": 23.712392807006836, + "learning_rate": 1.3679245283018869e-06, + "loss": 2.3755, + "num_input_tokens_seen": 302240, + "step": 465 + }, + { + "epoch": 0.27712264150943394, + "grad_norm": 20.29737663269043, + "learning_rate": 1.3826650943396229e-06, + "loss": 2.2079, + "num_input_tokens_seen": 305888, + "step": 470 + }, + { + "epoch": 0.2800707547169811, + "grad_norm": 10.936732292175293, + "learning_rate": 1.3974056603773586e-06, + "loss": 2.7853, + "num_input_tokens_seen": 308992, + "step": 475 + }, + { + "epoch": 0.2830188679245283, + "grad_norm": 18.892715454101562, + "learning_rate": 1.4121462264150946e-06, + "loss": 2.7675, + "num_input_tokens_seen": 311360, + "step": 480 + }, + { + "epoch": 0.28596698113207547, + "grad_norm": 15.853782653808594, + "learning_rate": 1.4268867924528304e-06, + "loss": 2.2163, + "num_input_tokens_seen": 313824, + "step": 485 + }, + { + "epoch": 0.28891509433962265, + "grad_norm": 24.81093978881836, + "learning_rate": 1.4416273584905664e-06, + "loss": 2.5636, + "num_input_tokens_seen": 316640, + "step": 490 + }, + { + "epoch": 0.2918632075471698, + "grad_norm": 17.189212799072266, + "learning_rate": 1.456367924528302e-06, + "loss": 2.1607, + "num_input_tokens_seen": 320576, + "step": 495 + }, + { + "epoch": 0.294811320754717, + "grad_norm": 15.070216178894043, + "learning_rate": 1.4711084905660377e-06, + "loss": 2.4592, + "num_input_tokens_seen": 324288, + "step": 500 + }, + { + "epoch": 0.2977594339622642, + "grad_norm": 17.137371063232422, + "learning_rate": 1.4858490566037737e-06, + "loss": 2.2579, + "num_input_tokens_seen": 327072, + "step": 505 + }, + { + "epoch": 0.3007075471698113, + "grad_norm": 19.067598342895508, + "learning_rate": 1.5005896226415096e-06, + "loss": 2.2751, + "num_input_tokens_seen": 329408, + "step": 510 + }, + { + "epoch": 0.30365566037735847, + "grad_norm": 14.832257270812988, + "learning_rate": 1.5153301886792454e-06, + "loss": 2.1318, + "num_input_tokens_seen": 332704, + "step": 515 + }, + { + "epoch": 0.30660377358490565, + "grad_norm": 23.307889938354492, + "learning_rate": 1.5300707547169814e-06, + "loss": 2.4837, + "num_input_tokens_seen": 335552, + "step": 520 + }, + { + "epoch": 0.3095518867924528, + "grad_norm": 17.52256202697754, + "learning_rate": 1.5448113207547172e-06, + "loss": 2.3355, + "num_input_tokens_seen": 338560, + "step": 525 + }, + { + "epoch": 0.3125, + "grad_norm": 23.74030303955078, + "learning_rate": 1.5595518867924531e-06, + "loss": 2.1778, + "num_input_tokens_seen": 342592, + "step": 530 + }, + { + "epoch": 0.3154481132075472, + "grad_norm": 13.900066375732422, + "learning_rate": 1.574292452830189e-06, + "loss": 1.9473, + "num_input_tokens_seen": 346176, + "step": 535 + }, + { + "epoch": 0.31839622641509435, + "grad_norm": 18.985767364501953, + "learning_rate": 1.5890330188679245e-06, + "loss": 1.8161, + "num_input_tokens_seen": 350464, + "step": 540 + }, + { + "epoch": 0.32134433962264153, + "grad_norm": 11.604238510131836, + "learning_rate": 1.6037735849056604e-06, + "loss": 2.2977, + "num_input_tokens_seen": 353856, + "step": 545 + }, + { + "epoch": 0.3242924528301887, + "grad_norm": 16.48134422302246, + "learning_rate": 1.6185141509433964e-06, + "loss": 1.9008, + "num_input_tokens_seen": 356608, + "step": 550 + }, + { + "epoch": 0.3272405660377358, + "grad_norm": 21.324331283569336, + "learning_rate": 1.6332547169811322e-06, + "loss": 2.6972, + "num_input_tokens_seen": 359520, + "step": 555 + }, + { + "epoch": 0.330188679245283, + "grad_norm": 14.078737258911133, + "learning_rate": 1.6479952830188682e-06, + "loss": 2.1941, + "num_input_tokens_seen": 363616, + "step": 560 + }, + { + "epoch": 0.3331367924528302, + "grad_norm": 33.28125762939453, + "learning_rate": 1.662735849056604e-06, + "loss": 2.5141, + "num_input_tokens_seen": 366336, + "step": 565 + }, + { + "epoch": 0.33608490566037735, + "grad_norm": 13.483412742614746, + "learning_rate": 1.67747641509434e-06, + "loss": 2.0076, + "num_input_tokens_seen": 369632, + "step": 570 + }, + { + "epoch": 0.33903301886792453, + "grad_norm": 13.411723136901855, + "learning_rate": 1.6922169811320757e-06, + "loss": 1.975, + "num_input_tokens_seen": 373280, + "step": 575 + }, + { + "epoch": 0.3419811320754717, + "grad_norm": 22.47404670715332, + "learning_rate": 1.7069575471698112e-06, + "loss": 2.4221, + "num_input_tokens_seen": 378176, + "step": 580 + }, + { + "epoch": 0.3449292452830189, + "grad_norm": 30.219545364379883, + "learning_rate": 1.7216981132075472e-06, + "loss": 1.9371, + "num_input_tokens_seen": 380992, + "step": 585 + }, + { + "epoch": 0.34787735849056606, + "grad_norm": 20.832317352294922, + "learning_rate": 1.736438679245283e-06, + "loss": 2.1914, + "num_input_tokens_seen": 383840, + "step": 590 + }, + { + "epoch": 0.35082547169811323, + "grad_norm": 12.889510154724121, + "learning_rate": 1.751179245283019e-06, + "loss": 1.7578, + "num_input_tokens_seen": 386720, + "step": 595 + }, + { + "epoch": 0.35377358490566035, + "grad_norm": 26.015750885009766, + "learning_rate": 1.765919811320755e-06, + "loss": 2.049, + "num_input_tokens_seen": 389664, + "step": 600 + }, + { + "epoch": 0.35672169811320753, + "grad_norm": 11.565692901611328, + "learning_rate": 1.7806603773584907e-06, + "loss": 2.5405, + "num_input_tokens_seen": 393024, + "step": 605 + }, + { + "epoch": 0.3596698113207547, + "grad_norm": 22.778697967529297, + "learning_rate": 1.7954009433962267e-06, + "loss": 1.9969, + "num_input_tokens_seen": 396576, + "step": 610 + }, + { + "epoch": 0.3626179245283019, + "grad_norm": 16.982954025268555, + "learning_rate": 1.8101415094339625e-06, + "loss": 2.3094, + "num_input_tokens_seen": 399488, + "step": 615 + }, + { + "epoch": 0.36556603773584906, + "grad_norm": 23.14407730102539, + "learning_rate": 1.8248820754716984e-06, + "loss": 1.9377, + "num_input_tokens_seen": 402496, + "step": 620 + }, + { + "epoch": 0.36851415094339623, + "grad_norm": 22.094654083251953, + "learning_rate": 1.839622641509434e-06, + "loss": 1.7228, + "num_input_tokens_seen": 405568, + "step": 625 + }, + { + "epoch": 0.3714622641509434, + "grad_norm": 27.078800201416016, + "learning_rate": 1.8543632075471698e-06, + "loss": 1.8129, + "num_input_tokens_seen": 408416, + "step": 630 + }, + { + "epoch": 0.3744103773584906, + "grad_norm": 26.680383682250977, + "learning_rate": 1.8691037735849057e-06, + "loss": 1.7833, + "num_input_tokens_seen": 411360, + "step": 635 + }, + { + "epoch": 0.37735849056603776, + "grad_norm": 13.581195831298828, + "learning_rate": 1.8838443396226417e-06, + "loss": 1.6358, + "num_input_tokens_seen": 414112, + "step": 640 + }, + { + "epoch": 0.3803066037735849, + "grad_norm": 23.067106246948242, + "learning_rate": 1.8985849056603775e-06, + "loss": 1.7158, + "num_input_tokens_seen": 417408, + "step": 645 + }, + { + "epoch": 0.38325471698113206, + "grad_norm": 21.167978286743164, + "learning_rate": 1.9133254716981133e-06, + "loss": 1.8702, + "num_input_tokens_seen": 420640, + "step": 650 + }, + { + "epoch": 0.38620283018867924, + "grad_norm": 24.932361602783203, + "learning_rate": 1.9280660377358494e-06, + "loss": 1.9492, + "num_input_tokens_seen": 422688, + "step": 655 + }, + { + "epoch": 0.3891509433962264, + "grad_norm": 14.881708145141602, + "learning_rate": 1.9428066037735852e-06, + "loss": 2.083, + "num_input_tokens_seen": 427136, + "step": 660 + }, + { + "epoch": 0.3920990566037736, + "grad_norm": 24.323211669921875, + "learning_rate": 1.957547169811321e-06, + "loss": 1.7583, + "num_input_tokens_seen": 430400, + "step": 665 + }, + { + "epoch": 0.39504716981132076, + "grad_norm": 12.40609359741211, + "learning_rate": 1.9722877358490568e-06, + "loss": 1.7232, + "num_input_tokens_seen": 433632, + "step": 670 + }, + { + "epoch": 0.39799528301886794, + "grad_norm": 16.017995834350586, + "learning_rate": 1.9870283018867925e-06, + "loss": 1.8499, + "num_input_tokens_seen": 436960, + "step": 675 + }, + { + "epoch": 0.4009433962264151, + "grad_norm": 14.273941040039062, + "learning_rate": 2.0017688679245283e-06, + "loss": 1.5063, + "num_input_tokens_seen": 439968, + "step": 680 + }, + { + "epoch": 0.40389150943396224, + "grad_norm": 20.851783752441406, + "learning_rate": 2.0165094339622645e-06, + "loss": 1.5491, + "num_input_tokens_seen": 443168, + "step": 685 + }, + { + "epoch": 0.4068396226415094, + "grad_norm": 14.981064796447754, + "learning_rate": 2.0312500000000002e-06, + "loss": 1.5964, + "num_input_tokens_seen": 445984, + "step": 690 + }, + { + "epoch": 0.4097877358490566, + "grad_norm": 22.555187225341797, + "learning_rate": 2.045990566037736e-06, + "loss": 1.7867, + "num_input_tokens_seen": 449280, + "step": 695 + }, + { + "epoch": 0.41273584905660377, + "grad_norm": 18.690513610839844, + "learning_rate": 2.0607311320754718e-06, + "loss": 1.5458, + "num_input_tokens_seen": 452832, + "step": 700 + }, + { + "epoch": 0.41568396226415094, + "grad_norm": 16.948097229003906, + "learning_rate": 2.075471698113208e-06, + "loss": 1.3588, + "num_input_tokens_seen": 455360, + "step": 705 + }, + { + "epoch": 0.4186320754716981, + "grad_norm": 11.862958908081055, + "learning_rate": 2.0902122641509437e-06, + "loss": 1.3306, + "num_input_tokens_seen": 458944, + "step": 710 + }, + { + "epoch": 0.4215801886792453, + "grad_norm": 13.978707313537598, + "learning_rate": 2.1049528301886795e-06, + "loss": 1.4604, + "num_input_tokens_seen": 462080, + "step": 715 + }, + { + "epoch": 0.42452830188679247, + "grad_norm": 21.152074813842773, + "learning_rate": 2.1196933962264153e-06, + "loss": 1.3827, + "num_input_tokens_seen": 465888, + "step": 720 + }, + { + "epoch": 0.42747641509433965, + "grad_norm": 36.478572845458984, + "learning_rate": 2.134433962264151e-06, + "loss": 1.6747, + "num_input_tokens_seen": 468928, + "step": 725 + }, + { + "epoch": 0.43042452830188677, + "grad_norm": 15.41918659210205, + "learning_rate": 2.149174528301887e-06, + "loss": 0.9826, + "num_input_tokens_seen": 472032, + "step": 730 + }, + { + "epoch": 0.43337264150943394, + "grad_norm": 12.585308074951172, + "learning_rate": 2.163915094339623e-06, + "loss": 1.2454, + "num_input_tokens_seen": 474816, + "step": 735 + }, + { + "epoch": 0.4363207547169811, + "grad_norm": 15.616990089416504, + "learning_rate": 2.1786556603773588e-06, + "loss": 1.61, + "num_input_tokens_seen": 477344, + "step": 740 + }, + { + "epoch": 0.4392688679245283, + "grad_norm": 14.420464515686035, + "learning_rate": 2.1933962264150945e-06, + "loss": 1.2651, + "num_input_tokens_seen": 480096, + "step": 745 + }, + { + "epoch": 0.44221698113207547, + "grad_norm": 9.266770362854004, + "learning_rate": 2.2081367924528303e-06, + "loss": 0.9204, + "num_input_tokens_seen": 483712, + "step": 750 + }, + { + "epoch": 0.44516509433962265, + "grad_norm": 14.800943374633789, + "learning_rate": 2.2228773584905665e-06, + "loss": 0.9592, + "num_input_tokens_seen": 487072, + "step": 755 + }, + { + "epoch": 0.4481132075471698, + "grad_norm": 12.020878791809082, + "learning_rate": 2.237617924528302e-06, + "loss": 1.185, + "num_input_tokens_seen": 490432, + "step": 760 + }, + { + "epoch": 0.451061320754717, + "grad_norm": 13.600262641906738, + "learning_rate": 2.252358490566038e-06, + "loss": 0.8052, + "num_input_tokens_seen": 493440, + "step": 765 + }, + { + "epoch": 0.4540094339622642, + "grad_norm": 31.355731964111328, + "learning_rate": 2.267099056603774e-06, + "loss": 0.947, + "num_input_tokens_seen": 496000, + "step": 770 + }, + { + "epoch": 0.4569575471698113, + "grad_norm": 13.489664077758789, + "learning_rate": 2.2818396226415096e-06, + "loss": 0.9103, + "num_input_tokens_seen": 499904, + "step": 775 + }, + { + "epoch": 0.45990566037735847, + "grad_norm": 13.685105323791504, + "learning_rate": 2.2965801886792453e-06, + "loss": 0.9831, + "num_input_tokens_seen": 504256, + "step": 780 + }, + { + "epoch": 0.46285377358490565, + "grad_norm": 10.982120513916016, + "learning_rate": 2.3113207547169815e-06, + "loss": 0.7673, + "num_input_tokens_seen": 507616, + "step": 785 + }, + { + "epoch": 0.4658018867924528, + "grad_norm": 7.945463180541992, + "learning_rate": 2.3260613207547173e-06, + "loss": 1.0915, + "num_input_tokens_seen": 510720, + "step": 790 + }, + { + "epoch": 0.46875, + "grad_norm": 10.44337272644043, + "learning_rate": 2.340801886792453e-06, + "loss": 0.8194, + "num_input_tokens_seen": 514016, + "step": 795 + }, + { + "epoch": 0.4716981132075472, + "grad_norm": 6.346359729766846, + "learning_rate": 2.355542452830189e-06, + "loss": 0.8714, + "num_input_tokens_seen": 517024, + "step": 800 + }, + { + "epoch": 0.47464622641509435, + "grad_norm": 7.394376277923584, + "learning_rate": 2.3702830188679246e-06, + "loss": 1.102, + "num_input_tokens_seen": 520032, + "step": 805 + }, + { + "epoch": 0.47759433962264153, + "grad_norm": 9.357361793518066, + "learning_rate": 2.3850235849056604e-06, + "loss": 0.9178, + "num_input_tokens_seen": 523232, + "step": 810 + }, + { + "epoch": 0.4805424528301887, + "grad_norm": 9.115985870361328, + "learning_rate": 2.3997641509433966e-06, + "loss": 0.8341, + "num_input_tokens_seen": 526080, + "step": 815 + }, + { + "epoch": 0.4834905660377358, + "grad_norm": 13.859272956848145, + "learning_rate": 2.4145047169811323e-06, + "loss": 0.8807, + "num_input_tokens_seen": 528384, + "step": 820 + }, + { + "epoch": 0.486438679245283, + "grad_norm": 9.01256275177002, + "learning_rate": 2.429245283018868e-06, + "loss": 0.699, + "num_input_tokens_seen": 531808, + "step": 825 + }, + { + "epoch": 0.4893867924528302, + "grad_norm": 11.935478210449219, + "learning_rate": 2.443985849056604e-06, + "loss": 0.9971, + "num_input_tokens_seen": 535392, + "step": 830 + }, + { + "epoch": 0.49233490566037735, + "grad_norm": 11.69451904296875, + "learning_rate": 2.45872641509434e-06, + "loss": 0.9152, + "num_input_tokens_seen": 538752, + "step": 835 + }, + { + "epoch": 0.49528301886792453, + "grad_norm": 11.001533508300781, + "learning_rate": 2.473466981132076e-06, + "loss": 0.8088, + "num_input_tokens_seen": 541920, + "step": 840 + }, + { + "epoch": 0.4982311320754717, + "grad_norm": 8.332868576049805, + "learning_rate": 2.4882075471698116e-06, + "loss": 0.856, + "num_input_tokens_seen": 545024, + "step": 845 + }, + { + "epoch": 0.5011792452830188, + "grad_norm": 9.973461151123047, + "learning_rate": 2.5029481132075474e-06, + "loss": 0.8352, + "num_input_tokens_seen": 548384, + "step": 850 + }, + { + "epoch": 0.504127358490566, + "grad_norm": 8.795652389526367, + "learning_rate": 2.517688679245283e-06, + "loss": 0.8574, + "num_input_tokens_seen": 551712, + "step": 855 + }, + { + "epoch": 0.5070754716981132, + "grad_norm": 5.081294536590576, + "learning_rate": 2.532429245283019e-06, + "loss": 0.7831, + "num_input_tokens_seen": 554688, + "step": 860 + }, + { + "epoch": 0.5100235849056604, + "grad_norm": 11.282505989074707, + "learning_rate": 2.547169811320755e-06, + "loss": 1.0198, + "num_input_tokens_seen": 557888, + "step": 865 + }, + { + "epoch": 0.5129716981132075, + "grad_norm": 5.791973114013672, + "learning_rate": 2.561910377358491e-06, + "loss": 0.7339, + "num_input_tokens_seen": 561856, + "step": 870 + }, + { + "epoch": 0.5159198113207547, + "grad_norm": 5.20530891418457, + "learning_rate": 2.5766509433962266e-06, + "loss": 0.711, + "num_input_tokens_seen": 565024, + "step": 875 + }, + { + "epoch": 0.5188679245283019, + "grad_norm": 5.657922744750977, + "learning_rate": 2.5913915094339624e-06, + "loss": 0.9388, + "num_input_tokens_seen": 568480, + "step": 880 + }, + { + "epoch": 0.5218160377358491, + "grad_norm": 12.070975303649902, + "learning_rate": 2.6061320754716986e-06, + "loss": 1.1166, + "num_input_tokens_seen": 572096, + "step": 885 + }, + { + "epoch": 0.5247641509433962, + "grad_norm": 6.184645652770996, + "learning_rate": 2.6208726415094343e-06, + "loss": 0.9136, + "num_input_tokens_seen": 575520, + "step": 890 + }, + { + "epoch": 0.5277122641509434, + "grad_norm": 7.415177822113037, + "learning_rate": 2.63561320754717e-06, + "loss": 0.748, + "num_input_tokens_seen": 578848, + "step": 895 + }, + { + "epoch": 0.5306603773584906, + "grad_norm": 7.983150005340576, + "learning_rate": 2.650353773584906e-06, + "loss": 0.6337, + "num_input_tokens_seen": 583008, + "step": 900 + }, + { + "epoch": 0.5336084905660378, + "grad_norm": 16.439682006835938, + "learning_rate": 2.665094339622642e-06, + "loss": 0.7095, + "num_input_tokens_seen": 586432, + "step": 905 + }, + { + "epoch": 0.5365566037735849, + "grad_norm": 7.634627819061279, + "learning_rate": 2.679834905660378e-06, + "loss": 0.877, + "num_input_tokens_seen": 589568, + "step": 910 + }, + { + "epoch": 0.5395047169811321, + "grad_norm": 7.967339038848877, + "learning_rate": 2.694575471698113e-06, + "loss": 0.7737, + "num_input_tokens_seen": 592416, + "step": 915 + }, + { + "epoch": 0.5424528301886793, + "grad_norm": 7.440727233886719, + "learning_rate": 2.709316037735849e-06, + "loss": 0.4487, + "num_input_tokens_seen": 595552, + "step": 920 + }, + { + "epoch": 0.5454009433962265, + "grad_norm": 5.393298625946045, + "learning_rate": 2.724056603773585e-06, + "loss": 0.5329, + "num_input_tokens_seen": 599104, + "step": 925 + }, + { + "epoch": 0.5483490566037735, + "grad_norm": 5.736482620239258, + "learning_rate": 2.738797169811321e-06, + "loss": 0.5568, + "num_input_tokens_seen": 602304, + "step": 930 + }, + { + "epoch": 0.5512971698113207, + "grad_norm": 18.674306869506836, + "learning_rate": 2.7535377358490567e-06, + "loss": 0.947, + "num_input_tokens_seen": 605376, + "step": 935 + }, + { + "epoch": 0.5542452830188679, + "grad_norm": 6.868896961212158, + "learning_rate": 2.7682783018867925e-06, + "loss": 0.9636, + "num_input_tokens_seen": 607648, + "step": 940 + }, + { + "epoch": 0.5571933962264151, + "grad_norm": 3.72993803024292, + "learning_rate": 2.7830188679245286e-06, + "loss": 0.7103, + "num_input_tokens_seen": 611232, + "step": 945 + }, + { + "epoch": 0.5601415094339622, + "grad_norm": 6.169341087341309, + "learning_rate": 2.7977594339622644e-06, + "loss": 0.6143, + "num_input_tokens_seen": 615776, + "step": 950 + }, + { + "epoch": 0.5630896226415094, + "grad_norm": 18.689990997314453, + "learning_rate": 2.8125e-06, + "loss": 0.842, + "num_input_tokens_seen": 618656, + "step": 955 + }, + { + "epoch": 0.5660377358490566, + "grad_norm": 6.143091678619385, + "learning_rate": 2.827240566037736e-06, + "loss": 0.6467, + "num_input_tokens_seen": 622528, + "step": 960 + }, + { + "epoch": 0.5689858490566038, + "grad_norm": 6.399939060211182, + "learning_rate": 2.841981132075472e-06, + "loss": 0.6852, + "num_input_tokens_seen": 625920, + "step": 965 + }, + { + "epoch": 0.5719339622641509, + "grad_norm": 5.964336395263672, + "learning_rate": 2.856721698113208e-06, + "loss": 0.6648, + "num_input_tokens_seen": 628544, + "step": 970 + }, + { + "epoch": 0.5748820754716981, + "grad_norm": 7.366810321807861, + "learning_rate": 2.8714622641509437e-06, + "loss": 0.7258, + "num_input_tokens_seen": 631072, + "step": 975 + }, + { + "epoch": 0.5778301886792453, + "grad_norm": 5.7462663650512695, + "learning_rate": 2.8862028301886794e-06, + "loss": 0.6933, + "num_input_tokens_seen": 633504, + "step": 980 + }, + { + "epoch": 0.5807783018867925, + "grad_norm": 10.469639778137207, + "learning_rate": 2.9009433962264156e-06, + "loss": 0.7531, + "num_input_tokens_seen": 636800, + "step": 985 + }, + { + "epoch": 0.5837264150943396, + "grad_norm": 4.1671223640441895, + "learning_rate": 2.9156839622641514e-06, + "loss": 0.5618, + "num_input_tokens_seen": 640160, + "step": 990 + }, + { + "epoch": 0.5866745283018868, + "grad_norm": 6.803442478179932, + "learning_rate": 2.930424528301887e-06, + "loss": 0.7435, + "num_input_tokens_seen": 643936, + "step": 995 + }, + { + "epoch": 0.589622641509434, + "grad_norm": 5.059441566467285, + "learning_rate": 2.9451650943396225e-06, + "loss": 0.5652, + "num_input_tokens_seen": 648384, + "step": 1000 + }, + { + "epoch": 0.5925707547169812, + "grad_norm": 5.3123650550842285, + "learning_rate": 2.9599056603773587e-06, + "loss": 0.6299, + "num_input_tokens_seen": 651168, + "step": 1005 + }, + { + "epoch": 0.5955188679245284, + "grad_norm": 3.5172204971313477, + "learning_rate": 2.9746462264150945e-06, + "loss": 0.7942, + "num_input_tokens_seen": 654272, + "step": 1010 + }, + { + "epoch": 0.5984669811320755, + "grad_norm": 4.174629211425781, + "learning_rate": 2.9893867924528302e-06, + "loss": 0.5136, + "num_input_tokens_seen": 657824, + "step": 1015 + }, + { + "epoch": 0.6014150943396226, + "grad_norm": 4.112761497497559, + "learning_rate": 3.004127358490566e-06, + "loss": 0.6794, + "num_input_tokens_seen": 661184, + "step": 1020 + }, + { + "epoch": 0.6043632075471698, + "grad_norm": 5.572396755218506, + "learning_rate": 3.018867924528302e-06, + "loss": 0.6936, + "num_input_tokens_seen": 665184, + "step": 1025 + }, + { + "epoch": 0.6073113207547169, + "grad_norm": 4.812779426574707, + "learning_rate": 3.033608490566038e-06, + "loss": 0.5847, + "num_input_tokens_seen": 667968, + "step": 1030 + }, + { + "epoch": 0.6102594339622641, + "grad_norm": 4.473247528076172, + "learning_rate": 3.0483490566037737e-06, + "loss": 0.5575, + "num_input_tokens_seen": 671360, + "step": 1035 + }, + { + "epoch": 0.6132075471698113, + "grad_norm": 5.123810768127441, + "learning_rate": 3.0630896226415095e-06, + "loss": 0.5811, + "num_input_tokens_seen": 674336, + "step": 1040 + }, + { + "epoch": 0.6161556603773585, + "grad_norm": 5.766445159912109, + "learning_rate": 3.0778301886792457e-06, + "loss": 0.5332, + "num_input_tokens_seen": 677504, + "step": 1045 + }, + { + "epoch": 0.6191037735849056, + "grad_norm": 9.049609184265137, + "learning_rate": 3.0925707547169815e-06, + "loss": 0.6762, + "num_input_tokens_seen": 680864, + "step": 1050 + }, + { + "epoch": 0.6220518867924528, + "grad_norm": 5.498325347900391, + "learning_rate": 3.1073113207547172e-06, + "loss": 0.6861, + "num_input_tokens_seen": 683776, + "step": 1055 + }, + { + "epoch": 0.625, + "grad_norm": 4.329188346862793, + "learning_rate": 3.122051886792453e-06, + "loss": 0.6801, + "num_input_tokens_seen": 686784, + "step": 1060 + }, + { + "epoch": 0.6279481132075472, + "grad_norm": 4.751805782318115, + "learning_rate": 3.136792452830189e-06, + "loss": 0.5865, + "num_input_tokens_seen": 690208, + "step": 1065 + }, + { + "epoch": 0.6308962264150944, + "grad_norm": 7.6678385734558105, + "learning_rate": 3.151533018867925e-06, + "loss": 0.7595, + "num_input_tokens_seen": 693664, + "step": 1070 + }, + { + "epoch": 0.6338443396226415, + "grad_norm": 8.4994478225708, + "learning_rate": 3.1662735849056607e-06, + "loss": 0.8149, + "num_input_tokens_seen": 697696, + "step": 1075 + }, + { + "epoch": 0.6367924528301887, + "grad_norm": 4.134837627410889, + "learning_rate": 3.181014150943397e-06, + "loss": 0.6523, + "num_input_tokens_seen": 700480, + "step": 1080 + }, + { + "epoch": 0.6397405660377359, + "grad_norm": 7.377701759338379, + "learning_rate": 3.1957547169811327e-06, + "loss": 0.6243, + "num_input_tokens_seen": 704096, + "step": 1085 + }, + { + "epoch": 0.6426886792452831, + "grad_norm": 8.720990180969238, + "learning_rate": 3.210495283018868e-06, + "loss": 0.6165, + "num_input_tokens_seen": 706720, + "step": 1090 + }, + { + "epoch": 0.6456367924528302, + "grad_norm": 10.208901405334473, + "learning_rate": 3.225235849056604e-06, + "loss": 0.7694, + "num_input_tokens_seen": 709536, + "step": 1095 + }, + { + "epoch": 0.6485849056603774, + "grad_norm": 5.441288948059082, + "learning_rate": 3.2399764150943396e-06, + "loss": 0.7433, + "num_input_tokens_seen": 713152, + "step": 1100 + }, + { + "epoch": 0.6515330188679245, + "grad_norm": 8.408998489379883, + "learning_rate": 3.2547169811320758e-06, + "loss": 0.7226, + "num_input_tokens_seen": 716160, + "step": 1105 + }, + { + "epoch": 0.6544811320754716, + "grad_norm": 8.885174751281738, + "learning_rate": 3.2694575471698115e-06, + "loss": 0.6081, + "num_input_tokens_seen": 719168, + "step": 1110 + }, + { + "epoch": 0.6574292452830188, + "grad_norm": 4.572237014770508, + "learning_rate": 3.2841981132075473e-06, + "loss": 0.5957, + "num_input_tokens_seen": 722944, + "step": 1115 + }, + { + "epoch": 0.660377358490566, + "grad_norm": 13.368962287902832, + "learning_rate": 3.298938679245283e-06, + "loss": 0.7109, + "num_input_tokens_seen": 726912, + "step": 1120 + }, + { + "epoch": 0.6633254716981132, + "grad_norm": 11.320699691772461, + "learning_rate": 3.3136792452830192e-06, + "loss": 0.6351, + "num_input_tokens_seen": 730688, + "step": 1125 + }, + { + "epoch": 0.6662735849056604, + "grad_norm": 3.7924575805664062, + "learning_rate": 3.328419811320755e-06, + "loss": 0.7068, + "num_input_tokens_seen": 734656, + "step": 1130 + }, + { + "epoch": 0.6692216981132075, + "grad_norm": 12.168998718261719, + "learning_rate": 3.3431603773584908e-06, + "loss": 0.7861, + "num_input_tokens_seen": 737408, + "step": 1135 + }, + { + "epoch": 0.6721698113207547, + "grad_norm": 4.969852447509766, + "learning_rate": 3.3579009433962266e-06, + "loss": 0.5272, + "num_input_tokens_seen": 740224, + "step": 1140 + }, + { + "epoch": 0.6751179245283019, + "grad_norm": 3.422187089920044, + "learning_rate": 3.3726415094339627e-06, + "loss": 0.5752, + "num_input_tokens_seen": 742688, + "step": 1145 + }, + { + "epoch": 0.6780660377358491, + "grad_norm": 5.729838848114014, + "learning_rate": 3.3873820754716985e-06, + "loss": 0.6491, + "num_input_tokens_seen": 745728, + "step": 1150 + }, + { + "epoch": 0.6810141509433962, + "grad_norm": 4.363771438598633, + "learning_rate": 3.4021226415094343e-06, + "loss": 0.586, + "num_input_tokens_seen": 749984, + "step": 1155 + }, + { + "epoch": 0.6839622641509434, + "grad_norm": 2.963905096054077, + "learning_rate": 3.4168632075471705e-06, + "loss": 0.6162, + "num_input_tokens_seen": 753408, + "step": 1160 + }, + { + "epoch": 0.6869103773584906, + "grad_norm": 11.451322555541992, + "learning_rate": 3.4316037735849062e-06, + "loss": 0.6239, + "num_input_tokens_seen": 755936, + "step": 1165 + }, + { + "epoch": 0.6898584905660378, + "grad_norm": 4.222641468048096, + "learning_rate": 3.446344339622642e-06, + "loss": 0.6231, + "num_input_tokens_seen": 759264, + "step": 1170 + }, + { + "epoch": 0.6928066037735849, + "grad_norm": 3.43054461479187, + "learning_rate": 3.4610849056603778e-06, + "loss": 0.5357, + "num_input_tokens_seen": 762976, + "step": 1175 + }, + { + "epoch": 0.6957547169811321, + "grad_norm": 6.6390790939331055, + "learning_rate": 3.475825471698113e-06, + "loss": 0.5106, + "num_input_tokens_seen": 765888, + "step": 1180 + }, + { + "epoch": 0.6987028301886793, + "grad_norm": 2.2927517890930176, + "learning_rate": 3.4905660377358493e-06, + "loss": 0.6496, + "num_input_tokens_seen": 769120, + "step": 1185 + }, + { + "epoch": 0.7016509433962265, + "grad_norm": 2.9012715816497803, + "learning_rate": 3.505306603773585e-06, + "loss": 0.6433, + "num_input_tokens_seen": 772704, + "step": 1190 + }, + { + "epoch": 0.7045990566037735, + "grad_norm": 5.627623081207275, + "learning_rate": 3.520047169811321e-06, + "loss": 0.6652, + "num_input_tokens_seen": 775328, + "step": 1195 + }, + { + "epoch": 0.7075471698113207, + "grad_norm": 13.428121566772461, + "learning_rate": 3.5347877358490566e-06, + "loss": 0.6089, + "num_input_tokens_seen": 778048, + "step": 1200 + }, + { + "epoch": 0.7104952830188679, + "grad_norm": 4.02173376083374, + "learning_rate": 3.549528301886793e-06, + "loss": 0.6877, + "num_input_tokens_seen": 781792, + "step": 1205 + }, + { + "epoch": 0.7134433962264151, + "grad_norm": 9.602651596069336, + "learning_rate": 3.5642688679245286e-06, + "loss": 0.6718, + "num_input_tokens_seen": 784000, + "step": 1210 + }, + { + "epoch": 0.7163915094339622, + "grad_norm": 4.908822059631348, + "learning_rate": 3.5790094339622643e-06, + "loss": 0.7184, + "num_input_tokens_seen": 788256, + "step": 1215 + }, + { + "epoch": 0.7193396226415094, + "grad_norm": 4.420402526855469, + "learning_rate": 3.59375e-06, + "loss": 0.6878, + "num_input_tokens_seen": 791648, + "step": 1220 + }, + { + "epoch": 0.7222877358490566, + "grad_norm": 5.150833606719971, + "learning_rate": 3.6084905660377363e-06, + "loss": 0.5438, + "num_input_tokens_seen": 795872, + "step": 1225 + }, + { + "epoch": 0.7252358490566038, + "grad_norm": 7.396393299102783, + "learning_rate": 3.623231132075472e-06, + "loss": 0.6473, + "num_input_tokens_seen": 798752, + "step": 1230 + }, + { + "epoch": 0.7281839622641509, + "grad_norm": 4.693133354187012, + "learning_rate": 3.637971698113208e-06, + "loss": 0.6027, + "num_input_tokens_seen": 802048, + "step": 1235 + }, + { + "epoch": 0.7311320754716981, + "grad_norm": 5.008869647979736, + "learning_rate": 3.652712264150944e-06, + "loss": 0.65, + "num_input_tokens_seen": 804768, + "step": 1240 + }, + { + "epoch": 0.7340801886792453, + "grad_norm": 3.0386149883270264, + "learning_rate": 3.66745283018868e-06, + "loss": 0.6283, + "num_input_tokens_seen": 808896, + "step": 1245 + }, + { + "epoch": 0.7370283018867925, + "grad_norm": 5.325741291046143, + "learning_rate": 3.6821933962264156e-06, + "loss": 0.6092, + "num_input_tokens_seen": 812288, + "step": 1250 + }, + { + "epoch": 0.7399764150943396, + "grad_norm": 2.8650918006896973, + "learning_rate": 3.6969339622641513e-06, + "loss": 0.6638, + "num_input_tokens_seen": 815840, + "step": 1255 + }, + { + "epoch": 0.7429245283018868, + "grad_norm": 5.373302936553955, + "learning_rate": 3.7116745283018875e-06, + "loss": 0.5759, + "num_input_tokens_seen": 819168, + "step": 1260 + }, + { + "epoch": 0.745872641509434, + "grad_norm": 2.8748016357421875, + "learning_rate": 3.726415094339623e-06, + "loss": 0.4967, + "num_input_tokens_seen": 822944, + "step": 1265 + }, + { + "epoch": 0.7488207547169812, + "grad_norm": 4.558932781219482, + "learning_rate": 3.7411556603773586e-06, + "loss": 0.6567, + "num_input_tokens_seen": 826272, + "step": 1270 + }, + { + "epoch": 0.7517688679245284, + "grad_norm": 3.5304794311523438, + "learning_rate": 3.7558962264150944e-06, + "loss": 0.585, + "num_input_tokens_seen": 830144, + "step": 1275 + }, + { + "epoch": 0.7547169811320755, + "grad_norm": 8.17846393585205, + "learning_rate": 3.77063679245283e-06, + "loss": 0.6279, + "num_input_tokens_seen": 832960, + "step": 1280 + }, + { + "epoch": 0.7576650943396226, + "grad_norm": 9.533529281616211, + "learning_rate": 3.7853773584905664e-06, + "loss": 0.6032, + "num_input_tokens_seen": 835392, + "step": 1285 + }, + { + "epoch": 0.7606132075471698, + "grad_norm": 2.678600311279297, + "learning_rate": 3.800117924528302e-06, + "loss": 0.5209, + "num_input_tokens_seen": 838592, + "step": 1290 + }, + { + "epoch": 0.7635613207547169, + "grad_norm": 6.290689945220947, + "learning_rate": 3.814858490566038e-06, + "loss": 0.4959, + "num_input_tokens_seen": 841824, + "step": 1295 + }, + { + "epoch": 0.7665094339622641, + "grad_norm": 4.903485298156738, + "learning_rate": 3.829599056603774e-06, + "loss": 0.5725, + "num_input_tokens_seen": 845760, + "step": 1300 + }, + { + "epoch": 0.7694575471698113, + "grad_norm": 5.686278820037842, + "learning_rate": 3.8443396226415094e-06, + "loss": 0.8529, + "num_input_tokens_seen": 852288, + "step": 1305 + }, + { + "epoch": 0.7724056603773585, + "grad_norm": 8.209455490112305, + "learning_rate": 3.859080188679246e-06, + "loss": 0.553, + "num_input_tokens_seen": 855008, + "step": 1310 + }, + { + "epoch": 0.7753537735849056, + "grad_norm": 5.806557655334473, + "learning_rate": 3.873820754716982e-06, + "loss": 0.5266, + "num_input_tokens_seen": 857728, + "step": 1315 + }, + { + "epoch": 0.7783018867924528, + "grad_norm": 4.764500141143799, + "learning_rate": 3.888561320754717e-06, + "loss": 0.5665, + "num_input_tokens_seen": 861472, + "step": 1320 + }, + { + "epoch": 0.78125, + "grad_norm": 11.363874435424805, + "learning_rate": 3.903301886792453e-06, + "loss": 0.6286, + "num_input_tokens_seen": 865568, + "step": 1325 + }, + { + "epoch": 0.7841981132075472, + "grad_norm": 8.491943359375, + "learning_rate": 3.9180424528301895e-06, + "loss": 0.7196, + "num_input_tokens_seen": 868928, + "step": 1330 + }, + { + "epoch": 0.7871462264150944, + "grad_norm": 5.015326023101807, + "learning_rate": 3.932783018867925e-06, + "loss": 0.4917, + "num_input_tokens_seen": 872096, + "step": 1335 + }, + { + "epoch": 0.7900943396226415, + "grad_norm": 2.4574317932128906, + "learning_rate": 3.947523584905661e-06, + "loss": 0.434, + "num_input_tokens_seen": 875296, + "step": 1340 + }, + { + "epoch": 0.7930424528301887, + "grad_norm": 25.127607345581055, + "learning_rate": 3.962264150943396e-06, + "loss": 0.5318, + "num_input_tokens_seen": 878144, + "step": 1345 + }, + { + "epoch": 0.7959905660377359, + "grad_norm": 4.2509918212890625, + "learning_rate": 3.977004716981133e-06, + "loss": 0.7072, + "num_input_tokens_seen": 880928, + "step": 1350 + }, + { + "epoch": 0.7989386792452831, + "grad_norm": 8.605904579162598, + "learning_rate": 3.991745283018868e-06, + "loss": 0.5436, + "num_input_tokens_seen": 883680, + "step": 1355 + }, + { + "epoch": 0.8018867924528302, + "grad_norm": 6.206994533538818, + "learning_rate": 4.006485849056604e-06, + "loss": 0.7083, + "num_input_tokens_seen": 886368, + "step": 1360 + }, + { + "epoch": 0.8048349056603774, + "grad_norm": 4.296226978302002, + "learning_rate": 4.0212264150943395e-06, + "loss": 0.7309, + "num_input_tokens_seen": 889728, + "step": 1365 + }, + { + "epoch": 0.8077830188679245, + "grad_norm": 7.924245834350586, + "learning_rate": 4.035966981132076e-06, + "loss": 0.5954, + "num_input_tokens_seen": 892992, + "step": 1370 + }, + { + "epoch": 0.8107311320754716, + "grad_norm": 5.801962375640869, + "learning_rate": 4.050707547169812e-06, + "loss": 0.6435, + "num_input_tokens_seen": 895712, + "step": 1375 + }, + { + "epoch": 0.8136792452830188, + "grad_norm": 6.71798038482666, + "learning_rate": 4.065448113207547e-06, + "loss": 0.6819, + "num_input_tokens_seen": 898400, + "step": 1380 + }, + { + "epoch": 0.816627358490566, + "grad_norm": 4.68912410736084, + "learning_rate": 4.080188679245283e-06, + "loss": 0.5789, + "num_input_tokens_seen": 901440, + "step": 1385 + }, + { + "epoch": 0.8195754716981132, + "grad_norm": 11.836541175842285, + "learning_rate": 4.094929245283019e-06, + "loss": 0.6054, + "num_input_tokens_seen": 904288, + "step": 1390 + }, + { + "epoch": 0.8225235849056604, + "grad_norm": 5.365652084350586, + "learning_rate": 4.109669811320755e-06, + "loss": 0.5652, + "num_input_tokens_seen": 907776, + "step": 1395 + }, + { + "epoch": 0.8254716981132075, + "grad_norm": 6.0620293617248535, + "learning_rate": 4.124410377358491e-06, + "loss": 0.6495, + "num_input_tokens_seen": 911584, + "step": 1400 + }, + { + "epoch": 0.8284198113207547, + "grad_norm": 6.532092571258545, + "learning_rate": 4.1391509433962265e-06, + "loss": 0.5128, + "num_input_tokens_seen": 915232, + "step": 1405 + }, + { + "epoch": 0.8313679245283019, + "grad_norm": 5.426935195922852, + "learning_rate": 4.153891509433963e-06, + "loss": 0.5075, + "num_input_tokens_seen": 917696, + "step": 1410 + }, + { + "epoch": 0.8343160377358491, + "grad_norm": 12.016154289245605, + "learning_rate": 4.168632075471699e-06, + "loss": 0.7284, + "num_input_tokens_seen": 922656, + "step": 1415 + }, + { + "epoch": 0.8372641509433962, + "grad_norm": 7.776079177856445, + "learning_rate": 4.183372641509434e-06, + "loss": 0.4272, + "num_input_tokens_seen": 925856, + "step": 1420 + }, + { + "epoch": 0.8402122641509434, + "grad_norm": 3.587447166442871, + "learning_rate": 4.19811320754717e-06, + "loss": 0.661, + "num_input_tokens_seen": 930176, + "step": 1425 + }, + { + "epoch": 0.8431603773584906, + "grad_norm": 3.0893397331237793, + "learning_rate": 4.212853773584907e-06, + "loss": 0.6749, + "num_input_tokens_seen": 933472, + "step": 1430 + }, + { + "epoch": 0.8461084905660378, + "grad_norm": 8.176351547241211, + "learning_rate": 4.227594339622642e-06, + "loss": 0.5572, + "num_input_tokens_seen": 936416, + "step": 1435 + }, + { + "epoch": 0.8490566037735849, + "grad_norm": 3.689838171005249, + "learning_rate": 4.242334905660378e-06, + "loss": 0.6833, + "num_input_tokens_seen": 939136, + "step": 1440 + }, + { + "epoch": 0.8520047169811321, + "grad_norm": 5.29396390914917, + "learning_rate": 4.2570754716981135e-06, + "loss": 0.6223, + "num_input_tokens_seen": 943168, + "step": 1445 + }, + { + "epoch": 0.8549528301886793, + "grad_norm": 14.127381324768066, + "learning_rate": 4.271816037735849e-06, + "loss": 0.5195, + "num_input_tokens_seen": 945568, + "step": 1450 + }, + { + "epoch": 0.8579009433962265, + "grad_norm": 2.6478066444396973, + "learning_rate": 4.286556603773585e-06, + "loss": 0.5574, + "num_input_tokens_seen": 948416, + "step": 1455 + }, + { + "epoch": 0.8608490566037735, + "grad_norm": 2.951313018798828, + "learning_rate": 4.301297169811321e-06, + "loss": 0.5297, + "num_input_tokens_seen": 950880, + "step": 1460 + }, + { + "epoch": 0.8637971698113207, + "grad_norm": 7.67824125289917, + "learning_rate": 4.3160377358490565e-06, + "loss": 0.5256, + "num_input_tokens_seen": 953632, + "step": 1465 + }, + { + "epoch": 0.8667452830188679, + "grad_norm": 6.527291774749756, + "learning_rate": 4.330778301886793e-06, + "loss": 0.6593, + "num_input_tokens_seen": 956320, + "step": 1470 + }, + { + "epoch": 0.8696933962264151, + "grad_norm": 3.0069620609283447, + "learning_rate": 4.345518867924529e-06, + "loss": 0.6287, + "num_input_tokens_seen": 959712, + "step": 1475 + }, + { + "epoch": 0.8726415094339622, + "grad_norm": 3.1576664447784424, + "learning_rate": 4.360259433962264e-06, + "loss": 0.5644, + "num_input_tokens_seen": 963264, + "step": 1480 + }, + { + "epoch": 0.8755896226415094, + "grad_norm": 3.573331594467163, + "learning_rate": 4.3750000000000005e-06, + "loss": 0.5753, + "num_input_tokens_seen": 967424, + "step": 1485 + }, + { + "epoch": 0.8785377358490566, + "grad_norm": 9.45398235321045, + "learning_rate": 4.389740566037737e-06, + "loss": 0.5454, + "num_input_tokens_seen": 974592, + "step": 1490 + }, + { + "epoch": 0.8814858490566038, + "grad_norm": 4.0027875900268555, + "learning_rate": 4.404481132075472e-06, + "loss": 0.6659, + "num_input_tokens_seen": 978784, + "step": 1495 + }, + { + "epoch": 0.8844339622641509, + "grad_norm": 7.140857219696045, + "learning_rate": 4.419221698113208e-06, + "loss": 0.5154, + "num_input_tokens_seen": 981696, + "step": 1500 + }, + { + "epoch": 0.8873820754716981, + "grad_norm": 11.862122535705566, + "learning_rate": 4.4339622641509435e-06, + "loss": 0.6376, + "num_input_tokens_seen": 985408, + "step": 1505 + }, + { + "epoch": 0.8903301886792453, + "grad_norm": 6.621129512786865, + "learning_rate": 4.44870283018868e-06, + "loss": 0.6862, + "num_input_tokens_seen": 988576, + "step": 1510 + }, + { + "epoch": 0.8932783018867925, + "grad_norm": 10.653403282165527, + "learning_rate": 4.463443396226416e-06, + "loss": 0.59, + "num_input_tokens_seen": 991840, + "step": 1515 + }, + { + "epoch": 0.8962264150943396, + "grad_norm": 5.599077224731445, + "learning_rate": 4.478183962264151e-06, + "loss": 0.6499, + "num_input_tokens_seen": 995200, + "step": 1520 + }, + { + "epoch": 0.8991745283018868, + "grad_norm": 4.309756278991699, + "learning_rate": 4.4929245283018875e-06, + "loss": 0.6689, + "num_input_tokens_seen": 998912, + "step": 1525 + }, + { + "epoch": 0.902122641509434, + "grad_norm": 6.143583297729492, + "learning_rate": 4.507665094339623e-06, + "loss": 0.5739, + "num_input_tokens_seen": 1002848, + "step": 1530 + }, + { + "epoch": 0.9050707547169812, + "grad_norm": 4.078437805175781, + "learning_rate": 4.522405660377359e-06, + "loss": 0.6967, + "num_input_tokens_seen": 1005312, + "step": 1535 + }, + { + "epoch": 0.9080188679245284, + "grad_norm": 3.9049553871154785, + "learning_rate": 4.537146226415094e-06, + "loss": 0.5949, + "num_input_tokens_seen": 1008576, + "step": 1540 + }, + { + "epoch": 0.9109669811320755, + "grad_norm": 2.814183473587036, + "learning_rate": 4.5518867924528305e-06, + "loss": 0.6176, + "num_input_tokens_seen": 1012736, + "step": 1545 + }, + { + "epoch": 0.9139150943396226, + "grad_norm": 3.7803328037261963, + "learning_rate": 4.566627358490566e-06, + "loss": 0.5534, + "num_input_tokens_seen": 1016320, + "step": 1550 + }, + { + "epoch": 0.9168632075471698, + "grad_norm": 4.066215515136719, + "learning_rate": 4.581367924528302e-06, + "loss": 0.6508, + "num_input_tokens_seen": 1019392, + "step": 1555 + }, + { + "epoch": 0.9198113207547169, + "grad_norm": 4.001386642456055, + "learning_rate": 4.596108490566038e-06, + "loss": 0.4678, + "num_input_tokens_seen": 1022080, + "step": 1560 + }, + { + "epoch": 0.9227594339622641, + "grad_norm": 4.080419063568115, + "learning_rate": 4.610849056603774e-06, + "loss": 0.5864, + "num_input_tokens_seen": 1024960, + "step": 1565 + }, + { + "epoch": 0.9257075471698113, + "grad_norm": 3.7366015911102295, + "learning_rate": 4.62558962264151e-06, + "loss": 0.6301, + "num_input_tokens_seen": 1028480, + "step": 1570 + }, + { + "epoch": 0.9286556603773585, + "grad_norm": 5.948261260986328, + "learning_rate": 4.640330188679246e-06, + "loss": 0.5089, + "num_input_tokens_seen": 1031328, + "step": 1575 + }, + { + "epoch": 0.9316037735849056, + "grad_norm": 4.513754367828369, + "learning_rate": 4.655070754716981e-06, + "loss": 0.4194, + "num_input_tokens_seen": 1034048, + "step": 1580 + }, + { + "epoch": 0.9345518867924528, + "grad_norm": 3.3506526947021484, + "learning_rate": 4.6698113207547175e-06, + "loss": 0.5023, + "num_input_tokens_seen": 1037024, + "step": 1585 + }, + { + "epoch": 0.9375, + "grad_norm": 4.534147262573242, + "learning_rate": 4.684551886792454e-06, + "loss": 0.5958, + "num_input_tokens_seen": 1040128, + "step": 1590 + }, + { + "epoch": 0.9404481132075472, + "grad_norm": 4.203045845031738, + "learning_rate": 4.699292452830189e-06, + "loss": 0.5879, + "num_input_tokens_seen": 1043328, + "step": 1595 + }, + { + "epoch": 0.9433962264150944, + "grad_norm": 4.055345058441162, + "learning_rate": 4.714033018867925e-06, + "loss": 0.5938, + "num_input_tokens_seen": 1046176, + "step": 1600 + }, + { + "epoch": 0.9463443396226415, + "grad_norm": 2.8795461654663086, + "learning_rate": 4.728773584905661e-06, + "loss": 0.6332, + "num_input_tokens_seen": 1048896, + "step": 1605 + }, + { + "epoch": 0.9492924528301887, + "grad_norm": 2.4212327003479004, + "learning_rate": 4.743514150943397e-06, + "loss": 0.6347, + "num_input_tokens_seen": 1052768, + "step": 1610 + }, + { + "epoch": 0.9522405660377359, + "grad_norm": 10.593609809875488, + "learning_rate": 4.758254716981133e-06, + "loss": 0.5877, + "num_input_tokens_seen": 1055616, + "step": 1615 + }, + { + "epoch": 0.9551886792452831, + "grad_norm": 2.602233648300171, + "learning_rate": 4.772995283018868e-06, + "loss": 0.5421, + "num_input_tokens_seen": 1059072, + "step": 1620 + }, + { + "epoch": 0.9581367924528302, + "grad_norm": 4.015645503997803, + "learning_rate": 4.787735849056604e-06, + "loss": 0.7592, + "num_input_tokens_seen": 1061920, + "step": 1625 + }, + { + "epoch": 0.9610849056603774, + "grad_norm": 7.811851501464844, + "learning_rate": 4.80247641509434e-06, + "loss": 0.581, + "num_input_tokens_seen": 1064896, + "step": 1630 + }, + { + "epoch": 0.9640330188679245, + "grad_norm": 3.367180585861206, + "learning_rate": 4.817216981132076e-06, + "loss": 0.6342, + "num_input_tokens_seen": 1067744, + "step": 1635 + }, + { + "epoch": 0.9669811320754716, + "grad_norm": 2.3358185291290283, + "learning_rate": 4.831957547169811e-06, + "loss": 0.5076, + "num_input_tokens_seen": 1070944, + "step": 1640 + }, + { + "epoch": 0.9699292452830188, + "grad_norm": 5.156381607055664, + "learning_rate": 4.8466981132075476e-06, + "loss": 0.5585, + "num_input_tokens_seen": 1073888, + "step": 1645 + }, + { + "epoch": 0.972877358490566, + "grad_norm": 9.319182395935059, + "learning_rate": 4.861438679245283e-06, + "loss": 0.8427, + "num_input_tokens_seen": 1078592, + "step": 1650 + }, + { + "epoch": 0.9758254716981132, + "grad_norm": 4.825729846954346, + "learning_rate": 4.876179245283019e-06, + "loss": 0.5229, + "num_input_tokens_seen": 1081920, + "step": 1655 + }, + { + "epoch": 0.9787735849056604, + "grad_norm": 5.914137363433838, + "learning_rate": 4.890919811320755e-06, + "loss": 0.6266, + "num_input_tokens_seen": 1084800, + "step": 1660 + }, + { + "epoch": 0.9817216981132075, + "grad_norm": 4.3888139724731445, + "learning_rate": 4.905660377358491e-06, + "loss": 0.5046, + "num_input_tokens_seen": 1088832, + "step": 1665 + }, + { + "epoch": 0.9846698113207547, + "grad_norm": 3.9114036560058594, + "learning_rate": 4.920400943396227e-06, + "loss": 0.8285, + "num_input_tokens_seen": 1092608, + "step": 1670 + }, + { + "epoch": 0.9876179245283019, + "grad_norm": 3.129378318786621, + "learning_rate": 4.935141509433963e-06, + "loss": 0.5754, + "num_input_tokens_seen": 1095776, + "step": 1675 + }, + { + "epoch": 0.9905660377358491, + "grad_norm": 1.8735584020614624, + "learning_rate": 4.949882075471698e-06, + "loss": 0.4921, + "num_input_tokens_seen": 1098816, + "step": 1680 + }, + { + "epoch": 0.9935141509433962, + "grad_norm": 2.893059253692627, + "learning_rate": 4.9646226415094346e-06, + "loss": 0.5883, + "num_input_tokens_seen": 1101664, + "step": 1685 + }, + { + "epoch": 0.9964622641509434, + "grad_norm": 4.612443923950195, + "learning_rate": 4.979363207547171e-06, + "loss": 0.5248, + "num_input_tokens_seen": 1105504, + "step": 1690 + }, + { + "epoch": 0.9994103773584906, + "grad_norm": 2.739424467086792, + "learning_rate": 4.994103773584906e-06, + "loss": 0.6374, + "num_input_tokens_seen": 1109088, + "step": 1695 + }, + { + "epoch": 1.0023584905660377, + "grad_norm": 2.9989612102508545, + "learning_rate": 5.0088443396226414e-06, + "loss": 0.4929, + "num_input_tokens_seen": 1111592, + "step": 1700 + }, + { + "epoch": 1.005306603773585, + "grad_norm": 3.7178943157196045, + "learning_rate": 5.023584905660378e-06, + "loss": 0.6107, + "num_input_tokens_seen": 1115016, + "step": 1705 + }, + { + "epoch": 1.008254716981132, + "grad_norm": 6.758875370025635, + "learning_rate": 5.038325471698113e-06, + "loss": 0.5915, + "num_input_tokens_seen": 1118248, + "step": 1710 + }, + { + "epoch": 1.0112028301886793, + "grad_norm": 12.016427993774414, + "learning_rate": 5.05306603773585e-06, + "loss": 0.5639, + "num_input_tokens_seen": 1122312, + "step": 1715 + }, + { + "epoch": 1.0141509433962264, + "grad_norm": 3.3121728897094727, + "learning_rate": 5.067806603773585e-06, + "loss": 0.5197, + "num_input_tokens_seen": 1125736, + "step": 1720 + }, + { + "epoch": 1.0170990566037736, + "grad_norm": 6.567498683929443, + "learning_rate": 5.0825471698113216e-06, + "loss": 0.4926, + "num_input_tokens_seen": 1128040, + "step": 1725 + }, + { + "epoch": 1.0200471698113207, + "grad_norm": 3.436720609664917, + "learning_rate": 5.097287735849057e-06, + "loss": 0.4927, + "num_input_tokens_seen": 1131176, + "step": 1730 + }, + { + "epoch": 1.022995283018868, + "grad_norm": 2.521578550338745, + "learning_rate": 5.112028301886793e-06, + "loss": 0.5068, + "num_input_tokens_seen": 1134504, + "step": 1735 + }, + { + "epoch": 1.025943396226415, + "grad_norm": 3.6427536010742188, + "learning_rate": 5.1267688679245284e-06, + "loss": 0.4646, + "num_input_tokens_seen": 1137224, + "step": 1740 + }, + { + "epoch": 1.0288915094339623, + "grad_norm": 5.974146366119385, + "learning_rate": 5.1415094339622655e-06, + "loss": 0.5454, + "num_input_tokens_seen": 1141160, + "step": 1745 + }, + { + "epoch": 1.0318396226415094, + "grad_norm": 2.740487813949585, + "learning_rate": 5.156250000000001e-06, + "loss": 0.4068, + "num_input_tokens_seen": 1144840, + "step": 1750 + }, + { + "epoch": 1.0347877358490567, + "grad_norm": 11.60596752166748, + "learning_rate": 5.170990566037736e-06, + "loss": 0.7379, + "num_input_tokens_seen": 1147688, + "step": 1755 + }, + { + "epoch": 1.0377358490566038, + "grad_norm": 3.635998010635376, + "learning_rate": 5.185731132075472e-06, + "loss": 0.6728, + "num_input_tokens_seen": 1151144, + "step": 1760 + }, + { + "epoch": 1.040683962264151, + "grad_norm": 4.0117034912109375, + "learning_rate": 5.200471698113208e-06, + "loss": 0.5028, + "num_input_tokens_seen": 1153576, + "step": 1765 + }, + { + "epoch": 1.0436320754716981, + "grad_norm": 5.33103084564209, + "learning_rate": 5.215212264150944e-06, + "loss": 0.606, + "num_input_tokens_seen": 1156520, + "step": 1770 + }, + { + "epoch": 1.0465801886792452, + "grad_norm": 6.539937973022461, + "learning_rate": 5.229952830188679e-06, + "loss": 0.727, + "num_input_tokens_seen": 1159112, + "step": 1775 + }, + { + "epoch": 1.0495283018867925, + "grad_norm": 4.715179920196533, + "learning_rate": 5.2446933962264154e-06, + "loss": 0.6011, + "num_input_tokens_seen": 1162696, + "step": 1780 + }, + { + "epoch": 1.0524764150943395, + "grad_norm": 3.8241524696350098, + "learning_rate": 5.259433962264151e-06, + "loss": 0.4691, + "num_input_tokens_seen": 1166216, + "step": 1785 + }, + { + "epoch": 1.0554245283018868, + "grad_norm": 3.14703631401062, + "learning_rate": 5.274174528301888e-06, + "loss": 0.6155, + "num_input_tokens_seen": 1169704, + "step": 1790 + }, + { + "epoch": 1.0583726415094339, + "grad_norm": 4.975287437438965, + "learning_rate": 5.288915094339623e-06, + "loss": 0.5914, + "num_input_tokens_seen": 1172968, + "step": 1795 + }, + { + "epoch": 1.0613207547169812, + "grad_norm": 4.610471248626709, + "learning_rate": 5.303655660377359e-06, + "loss": 0.6862, + "num_input_tokens_seen": 1176872, + "step": 1800 + }, + { + "epoch": 1.0642688679245282, + "grad_norm": 6.072854042053223, + "learning_rate": 5.318396226415095e-06, + "loss": 0.4426, + "num_input_tokens_seen": 1179304, + "step": 1805 + }, + { + "epoch": 1.0672169811320755, + "grad_norm": 3.9580442905426025, + "learning_rate": 5.333136792452831e-06, + "loss": 0.4884, + "num_input_tokens_seen": 1182408, + "step": 1810 + }, + { + "epoch": 1.0701650943396226, + "grad_norm": 5.04810905456543, + "learning_rate": 5.347877358490566e-06, + "loss": 0.4798, + "num_input_tokens_seen": 1185384, + "step": 1815 + }, + { + "epoch": 1.0731132075471699, + "grad_norm": 3.018094778060913, + "learning_rate": 5.362617924528302e-06, + "loss": 0.6003, + "num_input_tokens_seen": 1188392, + "step": 1820 + }, + { + "epoch": 1.076061320754717, + "grad_norm": 4.425589561462402, + "learning_rate": 5.377358490566038e-06, + "loss": 0.5613, + "num_input_tokens_seen": 1192264, + "step": 1825 + }, + { + "epoch": 1.0790094339622642, + "grad_norm": 3.30791974067688, + "learning_rate": 5.392099056603775e-06, + "loss": 0.5039, + "num_input_tokens_seen": 1195592, + "step": 1830 + }, + { + "epoch": 1.0819575471698113, + "grad_norm": 5.349789142608643, + "learning_rate": 5.40683962264151e-06, + "loss": 0.7268, + "num_input_tokens_seen": 1198216, + "step": 1835 + }, + { + "epoch": 1.0849056603773586, + "grad_norm": 4.947377681732178, + "learning_rate": 5.4215801886792455e-06, + "loss": 0.491, + "num_input_tokens_seen": 1201224, + "step": 1840 + }, + { + "epoch": 1.0878537735849056, + "grad_norm": 3.6661951541900635, + "learning_rate": 5.436320754716982e-06, + "loss": 0.5731, + "num_input_tokens_seen": 1204232, + "step": 1845 + }, + { + "epoch": 1.0908018867924527, + "grad_norm": 5.3298258781433105, + "learning_rate": 5.451061320754717e-06, + "loss": 0.611, + "num_input_tokens_seen": 1207624, + "step": 1850 + }, + { + "epoch": 1.09375, + "grad_norm": 5.922006130218506, + "learning_rate": 5.465801886792453e-06, + "loss": 0.5949, + "num_input_tokens_seen": 1210664, + "step": 1855 + }, + { + "epoch": 1.0966981132075473, + "grad_norm": 2.9588658809661865, + "learning_rate": 5.4805424528301886e-06, + "loss": 0.71, + "num_input_tokens_seen": 1214024, + "step": 1860 + }, + { + "epoch": 1.0996462264150944, + "grad_norm": 2.568260431289673, + "learning_rate": 5.495283018867925e-06, + "loss": 0.3621, + "num_input_tokens_seen": 1217192, + "step": 1865 + }, + { + "epoch": 1.1025943396226414, + "grad_norm": 2.226083517074585, + "learning_rate": 5.51002358490566e-06, + "loss": 0.5362, + "num_input_tokens_seen": 1219912, + "step": 1870 + }, + { + "epoch": 1.1055424528301887, + "grad_norm": 4.561788082122803, + "learning_rate": 5.524764150943397e-06, + "loss": 0.5, + "num_input_tokens_seen": 1222184, + "step": 1875 + }, + { + "epoch": 1.1084905660377358, + "grad_norm": 7.513849258422852, + "learning_rate": 5.5395047169811325e-06, + "loss": 0.7646, + "num_input_tokens_seen": 1224680, + "step": 1880 + }, + { + "epoch": 1.111438679245283, + "grad_norm": 4.5633978843688965, + "learning_rate": 5.554245283018869e-06, + "loss": 0.7365, + "num_input_tokens_seen": 1228104, + "step": 1885 + }, + { + "epoch": 1.1143867924528301, + "grad_norm": 4.611073970794678, + "learning_rate": 5.568985849056604e-06, + "loss": 0.5355, + "num_input_tokens_seen": 1231528, + "step": 1890 + }, + { + "epoch": 1.1173349056603774, + "grad_norm": 6.0877604484558105, + "learning_rate": 5.58372641509434e-06, + "loss": 0.494, + "num_input_tokens_seen": 1234728, + "step": 1895 + }, + { + "epoch": 1.1202830188679245, + "grad_norm": 3.8698697090148926, + "learning_rate": 5.5984669811320755e-06, + "loss": 0.5302, + "num_input_tokens_seen": 1237800, + "step": 1900 + }, + { + "epoch": 1.1232311320754718, + "grad_norm": 15.551494598388672, + "learning_rate": 5.613207547169813e-06, + "loss": 0.4128, + "num_input_tokens_seen": 1240744, + "step": 1905 + }, + { + "epoch": 1.1261792452830188, + "grad_norm": 2.462249279022217, + "learning_rate": 5.627948113207548e-06, + "loss": 0.5518, + "num_input_tokens_seen": 1245608, + "step": 1910 + }, + { + "epoch": 1.1291273584905661, + "grad_norm": 3.139821767807007, + "learning_rate": 5.642688679245284e-06, + "loss": 0.4392, + "num_input_tokens_seen": 1248456, + "step": 1915 + }, + { + "epoch": 1.1320754716981132, + "grad_norm": 2.583340644836426, + "learning_rate": 5.6574292452830195e-06, + "loss": 0.5152, + "num_input_tokens_seen": 1252232, + "step": 1920 + }, + { + "epoch": 1.1350235849056605, + "grad_norm": 4.024645805358887, + "learning_rate": 5.672169811320756e-06, + "loss": 0.5063, + "num_input_tokens_seen": 1255048, + "step": 1925 + }, + { + "epoch": 1.1379716981132075, + "grad_norm": 3.1879117488861084, + "learning_rate": 5.686910377358491e-06, + "loss": 0.5155, + "num_input_tokens_seen": 1258664, + "step": 1930 + }, + { + "epoch": 1.1409198113207548, + "grad_norm": 3.5429654121398926, + "learning_rate": 5.701650943396226e-06, + "loss": 0.6073, + "num_input_tokens_seen": 1261576, + "step": 1935 + }, + { + "epoch": 1.1438679245283019, + "grad_norm": 2.05853009223938, + "learning_rate": 5.7163915094339625e-06, + "loss": 0.4653, + "num_input_tokens_seen": 1264392, + "step": 1940 + }, + { + "epoch": 1.146816037735849, + "grad_norm": 2.876725435256958, + "learning_rate": 5.731132075471698e-06, + "loss": 0.5307, + "num_input_tokens_seen": 1267720, + "step": 1945 + }, + { + "epoch": 1.1497641509433962, + "grad_norm": 3.2926602363586426, + "learning_rate": 5.745872641509435e-06, + "loss": 0.4903, + "num_input_tokens_seen": 1270600, + "step": 1950 + }, + { + "epoch": 1.1527122641509433, + "grad_norm": 4.320738315582275, + "learning_rate": 5.76061320754717e-06, + "loss": 0.7938, + "num_input_tokens_seen": 1274312, + "step": 1955 + }, + { + "epoch": 1.1556603773584906, + "grad_norm": 3.644739866256714, + "learning_rate": 5.7753537735849065e-06, + "loss": 0.577, + "num_input_tokens_seen": 1278088, + "step": 1960 + }, + { + "epoch": 1.1586084905660377, + "grad_norm": 2.6592800617218018, + "learning_rate": 5.790094339622642e-06, + "loss": 0.5244, + "num_input_tokens_seen": 1281160, + "step": 1965 + }, + { + "epoch": 1.161556603773585, + "grad_norm": 2.3588733673095703, + "learning_rate": 5.804834905660378e-06, + "loss": 0.5503, + "num_input_tokens_seen": 1284840, + "step": 1970 + }, + { + "epoch": 1.164504716981132, + "grad_norm": 3.7015442848205566, + "learning_rate": 5.819575471698113e-06, + "loss": 0.5648, + "num_input_tokens_seen": 1288232, + "step": 1975 + }, + { + "epoch": 1.1674528301886793, + "grad_norm": 3.35829758644104, + "learning_rate": 5.8343160377358495e-06, + "loss": 0.6383, + "num_input_tokens_seen": 1291464, + "step": 1980 + }, + { + "epoch": 1.1704009433962264, + "grad_norm": 7.999370574951172, + "learning_rate": 5.849056603773585e-06, + "loss": 0.5669, + "num_input_tokens_seen": 1295496, + "step": 1985 + }, + { + "epoch": 1.1733490566037736, + "grad_norm": 9.482475280761719, + "learning_rate": 5.863797169811322e-06, + "loss": 0.5102, + "num_input_tokens_seen": 1298248, + "step": 1990 + }, + { + "epoch": 1.1762971698113207, + "grad_norm": 16.183162689208984, + "learning_rate": 5.878537735849057e-06, + "loss": 0.555, + "num_input_tokens_seen": 1301416, + "step": 1995 + }, + { + "epoch": 1.179245283018868, + "grad_norm": 4.1765522956848145, + "learning_rate": 5.8932783018867934e-06, + "loss": 0.5001, + "num_input_tokens_seen": 1304168, + "step": 2000 + }, + { + "epoch": 1.182193396226415, + "grad_norm": 2.824888229370117, + "learning_rate": 5.908018867924529e-06, + "loss": 0.4784, + "num_input_tokens_seen": 1309928, + "step": 2005 + }, + { + "epoch": 1.1851415094339623, + "grad_norm": 3.2439990043640137, + "learning_rate": 5.922759433962265e-06, + "loss": 0.3776, + "num_input_tokens_seen": 1312424, + "step": 2010 + }, + { + "epoch": 1.1880896226415094, + "grad_norm": 4.059011936187744, + "learning_rate": 5.9375e-06, + "loss": 0.6335, + "num_input_tokens_seen": 1315272, + "step": 2015 + }, + { + "epoch": 1.1910377358490567, + "grad_norm": 3.7251877784729004, + "learning_rate": 5.952240566037736e-06, + "loss": 0.4161, + "num_input_tokens_seen": 1318632, + "step": 2020 + }, + { + "epoch": 1.1939858490566038, + "grad_norm": 4.338551044464111, + "learning_rate": 5.966981132075472e-06, + "loss": 0.6086, + "num_input_tokens_seen": 1322504, + "step": 2025 + }, + { + "epoch": 1.196933962264151, + "grad_norm": 23.651634216308594, + "learning_rate": 5.981721698113207e-06, + "loss": 0.6115, + "num_input_tokens_seen": 1326184, + "step": 2030 + }, + { + "epoch": 1.1998820754716981, + "grad_norm": 4.28742790222168, + "learning_rate": 5.996462264150944e-06, + "loss": 0.5803, + "num_input_tokens_seen": 1329128, + "step": 2035 + }, + { + "epoch": 1.2028301886792452, + "grad_norm": 4.192485809326172, + "learning_rate": 6.01120283018868e-06, + "loss": 0.3544, + "num_input_tokens_seen": 1334568, + "step": 2040 + }, + { + "epoch": 1.2057783018867925, + "grad_norm": 3.5532679557800293, + "learning_rate": 6.025943396226416e-06, + "loss": 0.6456, + "num_input_tokens_seen": 1338024, + "step": 2045 + }, + { + "epoch": 1.2087264150943395, + "grad_norm": 2.531177520751953, + "learning_rate": 6.040683962264151e-06, + "loss": 0.401, + "num_input_tokens_seen": 1341672, + "step": 2050 + }, + { + "epoch": 1.2116745283018868, + "grad_norm": 8.4788236618042, + "learning_rate": 6.055424528301887e-06, + "loss": 0.5664, + "num_input_tokens_seen": 1344072, + "step": 2055 + }, + { + "epoch": 1.2146226415094339, + "grad_norm": 3.6985864639282227, + "learning_rate": 6.070165094339623e-06, + "loss": 0.5289, + "num_input_tokens_seen": 1349256, + "step": 2060 + }, + { + "epoch": 1.2175707547169812, + "grad_norm": 18.42817497253418, + "learning_rate": 6.08490566037736e-06, + "loss": 0.5231, + "num_input_tokens_seen": 1351464, + "step": 2065 + }, + { + "epoch": 1.2205188679245282, + "grad_norm": 2.877884864807129, + "learning_rate": 6.099646226415095e-06, + "loss": 0.6018, + "num_input_tokens_seen": 1354664, + "step": 2070 + }, + { + "epoch": 1.2234669811320755, + "grad_norm": 4.898430824279785, + "learning_rate": 6.114386792452831e-06, + "loss": 0.5997, + "num_input_tokens_seen": 1358856, + "step": 2075 + }, + { + "epoch": 1.2264150943396226, + "grad_norm": 4.283647060394287, + "learning_rate": 6.129127358490567e-06, + "loss": 0.4314, + "num_input_tokens_seen": 1361320, + "step": 2080 + }, + { + "epoch": 1.2293632075471699, + "grad_norm": 4.192919731140137, + "learning_rate": 6.143867924528303e-06, + "loss": 0.6109, + "num_input_tokens_seen": 1364808, + "step": 2085 + }, + { + "epoch": 1.232311320754717, + "grad_norm": 4.79982852935791, + "learning_rate": 6.158608490566038e-06, + "loss": 0.6597, + "num_input_tokens_seen": 1367912, + "step": 2090 + }, + { + "epoch": 1.2352594339622642, + "grad_norm": 3.2680094242095947, + "learning_rate": 6.173349056603774e-06, + "loss": 0.4178, + "num_input_tokens_seen": 1371432, + "step": 2095 + }, + { + "epoch": 1.2382075471698113, + "grad_norm": 4.653439998626709, + "learning_rate": 6.18808962264151e-06, + "loss": 0.7139, + "num_input_tokens_seen": 1373928, + "step": 2100 + }, + { + "epoch": 1.2411556603773586, + "grad_norm": 6.5873517990112305, + "learning_rate": 6.202830188679245e-06, + "loss": 0.5577, + "num_input_tokens_seen": 1377800, + "step": 2105 + }, + { + "epoch": 1.2441037735849056, + "grad_norm": 3.0361716747283936, + "learning_rate": 6.217570754716982e-06, + "loss": 0.4557, + "num_input_tokens_seen": 1381064, + "step": 2110 + }, + { + "epoch": 1.2470518867924527, + "grad_norm": 3.097989559173584, + "learning_rate": 6.232311320754717e-06, + "loss": 0.6226, + "num_input_tokens_seen": 1384232, + "step": 2115 + }, + { + "epoch": 1.25, + "grad_norm": 4.255221366882324, + "learning_rate": 6.2470518867924536e-06, + "loss": 0.518, + "num_input_tokens_seen": 1387240, + "step": 2120 + }, + { + "epoch": 1.2529481132075473, + "grad_norm": 3.4855329990386963, + "learning_rate": 6.261792452830189e-06, + "loss": 0.6236, + "num_input_tokens_seen": 1390248, + "step": 2125 + }, + { + "epoch": 1.2558962264150944, + "grad_norm": 2.9747116565704346, + "learning_rate": 6.276533018867925e-06, + "loss": 0.4841, + "num_input_tokens_seen": 1393768, + "step": 2130 + }, + { + "epoch": 1.2588443396226414, + "grad_norm": 2.1630825996398926, + "learning_rate": 6.2912735849056604e-06, + "loss": 0.5816, + "num_input_tokens_seen": 1396808, + "step": 2135 + }, + { + "epoch": 1.2617924528301887, + "grad_norm": 2.1857645511627197, + "learning_rate": 6.306014150943397e-06, + "loss": 0.5023, + "num_input_tokens_seen": 1399656, + "step": 2140 + }, + { + "epoch": 1.2647405660377358, + "grad_norm": 3.070129156112671, + "learning_rate": 6.320754716981132e-06, + "loss": 0.506, + "num_input_tokens_seen": 1402792, + "step": 2145 + }, + { + "epoch": 1.267688679245283, + "grad_norm": 3.632359266281128, + "learning_rate": 6.335495283018869e-06, + "loss": 0.5521, + "num_input_tokens_seen": 1405160, + "step": 2150 + }, + { + "epoch": 1.2706367924528301, + "grad_norm": 5.355973243713379, + "learning_rate": 6.350235849056604e-06, + "loss": 0.5534, + "num_input_tokens_seen": 1409512, + "step": 2155 + }, + { + "epoch": 1.2735849056603774, + "grad_norm": 3.930882692337036, + "learning_rate": 6.3649764150943406e-06, + "loss": 0.4787, + "num_input_tokens_seen": 1413672, + "step": 2160 + }, + { + "epoch": 1.2765330188679245, + "grad_norm": 5.208805084228516, + "learning_rate": 6.379716981132076e-06, + "loss": 0.6297, + "num_input_tokens_seen": 1416968, + "step": 2165 + }, + { + "epoch": 1.2794811320754718, + "grad_norm": 9.071327209472656, + "learning_rate": 6.394457547169812e-06, + "loss": 0.5866, + "num_input_tokens_seen": 1419208, + "step": 2170 + }, + { + "epoch": 1.2824292452830188, + "grad_norm": 4.71073579788208, + "learning_rate": 6.4091981132075474e-06, + "loss": 0.5419, + "num_input_tokens_seen": 1422728, + "step": 2175 + }, + { + "epoch": 1.2853773584905661, + "grad_norm": 2.9785704612731934, + "learning_rate": 6.423938679245284e-06, + "loss": 0.5076, + "num_input_tokens_seen": 1426568, + "step": 2180 + }, + { + "epoch": 1.2883254716981132, + "grad_norm": 3.893916130065918, + "learning_rate": 6.438679245283019e-06, + "loss": 0.6449, + "num_input_tokens_seen": 1430088, + "step": 2185 + }, + { + "epoch": 1.2912735849056602, + "grad_norm": 4.394443035125732, + "learning_rate": 6.453419811320756e-06, + "loss": 0.6354, + "num_input_tokens_seen": 1432424, + "step": 2190 + }, + { + "epoch": 1.2942216981132075, + "grad_norm": 2.975356340408325, + "learning_rate": 6.468160377358491e-06, + "loss": 0.5598, + "num_input_tokens_seen": 1436488, + "step": 2195 + }, + { + "epoch": 1.2971698113207548, + "grad_norm": 12.325146675109863, + "learning_rate": 6.482900943396227e-06, + "loss": 0.4876, + "num_input_tokens_seen": 1439848, + "step": 2200 + }, + { + "epoch": 1.3001179245283019, + "grad_norm": 3.521345615386963, + "learning_rate": 6.497641509433963e-06, + "loss": 0.463, + "num_input_tokens_seen": 1443176, + "step": 2205 + }, + { + "epoch": 1.303066037735849, + "grad_norm": 4.2001872062683105, + "learning_rate": 6.512382075471698e-06, + "loss": 0.5403, + "num_input_tokens_seen": 1451400, + "step": 2210 + }, + { + "epoch": 1.3060141509433962, + "grad_norm": 10.69773006439209, + "learning_rate": 6.5271226415094344e-06, + "loss": 0.7305, + "num_input_tokens_seen": 1454280, + "step": 2215 + }, + { + "epoch": 1.3089622641509435, + "grad_norm": 3.4736387729644775, + "learning_rate": 6.54186320754717e-06, + "loss": 0.5723, + "num_input_tokens_seen": 1456904, + "step": 2220 + }, + { + "epoch": 1.3119103773584906, + "grad_norm": 3.9165291786193848, + "learning_rate": 6.556603773584907e-06, + "loss": 0.4412, + "num_input_tokens_seen": 1460712, + "step": 2225 + }, + { + "epoch": 1.3148584905660377, + "grad_norm": 3.57822847366333, + "learning_rate": 6.571344339622641e-06, + "loss": 0.525, + "num_input_tokens_seen": 1464616, + "step": 2230 + }, + { + "epoch": 1.317806603773585, + "grad_norm": 2.1331441402435303, + "learning_rate": 6.586084905660378e-06, + "loss": 0.4586, + "num_input_tokens_seen": 1468584, + "step": 2235 + }, + { + "epoch": 1.320754716981132, + "grad_norm": 2.6206865310668945, + "learning_rate": 6.600825471698114e-06, + "loss": 0.4612, + "num_input_tokens_seen": 1472072, + "step": 2240 + }, + { + "epoch": 1.3237028301886793, + "grad_norm": 3.5069103240966797, + "learning_rate": 6.61556603773585e-06, + "loss": 0.5793, + "num_input_tokens_seen": 1477928, + "step": 2245 + }, + { + "epoch": 1.3266509433962264, + "grad_norm": 2.6483612060546875, + "learning_rate": 6.630306603773585e-06, + "loss": 0.5162, + "num_input_tokens_seen": 1481032, + "step": 2250 + }, + { + "epoch": 1.3295990566037736, + "grad_norm": 3.1596710681915283, + "learning_rate": 6.645047169811321e-06, + "loss": 0.7315, + "num_input_tokens_seen": 1484232, + "step": 2255 + }, + { + "epoch": 1.3325471698113207, + "grad_norm": 5.666794300079346, + "learning_rate": 6.659787735849057e-06, + "loss": 0.5708, + "num_input_tokens_seen": 1487592, + "step": 2260 + }, + { + "epoch": 1.335495283018868, + "grad_norm": 3.1301307678222656, + "learning_rate": 6.674528301886794e-06, + "loss": 0.4436, + "num_input_tokens_seen": 1490344, + "step": 2265 + }, + { + "epoch": 1.338443396226415, + "grad_norm": 3.248842239379883, + "learning_rate": 6.689268867924529e-06, + "loss": 0.3889, + "num_input_tokens_seen": 1493288, + "step": 2270 + }, + { + "epoch": 1.3413915094339623, + "grad_norm": 5.361477851867676, + "learning_rate": 6.704009433962265e-06, + "loss": 0.608, + "num_input_tokens_seen": 1496008, + "step": 2275 + }, + { + "epoch": 1.3443396226415094, + "grad_norm": 2.7907636165618896, + "learning_rate": 6.718750000000001e-06, + "loss": 0.6057, + "num_input_tokens_seen": 1498856, + "step": 2280 + }, + { + "epoch": 1.3472877358490565, + "grad_norm": 2.6450541019439697, + "learning_rate": 6.733490566037736e-06, + "loss": 0.5424, + "num_input_tokens_seen": 1501800, + "step": 2285 + }, + { + "epoch": 1.3502358490566038, + "grad_norm": 3.6860363483428955, + "learning_rate": 6.748231132075472e-06, + "loss": 0.5354, + "num_input_tokens_seen": 1504872, + "step": 2290 + }, + { + "epoch": 1.353183962264151, + "grad_norm": 2.0752086639404297, + "learning_rate": 6.7629716981132076e-06, + "loss": 0.5135, + "num_input_tokens_seen": 1508520, + "step": 2295 + }, + { + "epoch": 1.3561320754716981, + "grad_norm": 3.1735289096832275, + "learning_rate": 6.777712264150944e-06, + "loss": 0.4051, + "num_input_tokens_seen": 1511528, + "step": 2300 + }, + { + "epoch": 1.3590801886792452, + "grad_norm": 4.095285415649414, + "learning_rate": 6.792452830188679e-06, + "loss": 0.5546, + "num_input_tokens_seen": 1514376, + "step": 2305 + }, + { + "epoch": 1.3620283018867925, + "grad_norm": 4.507347106933594, + "learning_rate": 6.807193396226416e-06, + "loss": 0.4569, + "num_input_tokens_seen": 1517896, + "step": 2310 + }, + { + "epoch": 1.3649764150943398, + "grad_norm": 2.3421244621276855, + "learning_rate": 6.8219339622641515e-06, + "loss": 0.419, + "num_input_tokens_seen": 1521224, + "step": 2315 + }, + { + "epoch": 1.3679245283018868, + "grad_norm": 4.5384111404418945, + "learning_rate": 6.836674528301888e-06, + "loss": 0.5165, + "num_input_tokens_seen": 1523848, + "step": 2320 + }, + { + "epoch": 1.3708726415094339, + "grad_norm": 2.6614108085632324, + "learning_rate": 6.851415094339623e-06, + "loss": 0.3965, + "num_input_tokens_seen": 1527912, + "step": 2325 + }, + { + "epoch": 1.3738207547169812, + "grad_norm": 8.495092391967773, + "learning_rate": 6.866155660377359e-06, + "loss": 0.5324, + "num_input_tokens_seen": 1530920, + "step": 2330 + }, + { + "epoch": 1.3767688679245282, + "grad_norm": 8.671310424804688, + "learning_rate": 6.8808962264150946e-06, + "loss": 0.5339, + "num_input_tokens_seen": 1534760, + "step": 2335 + }, + { + "epoch": 1.3797169811320755, + "grad_norm": 5.221708297729492, + "learning_rate": 6.895636792452831e-06, + "loss": 0.6572, + "num_input_tokens_seen": 1538920, + "step": 2340 + }, + { + "epoch": 1.3826650943396226, + "grad_norm": 2.072862148284912, + "learning_rate": 6.910377358490566e-06, + "loss": 0.5419, + "num_input_tokens_seen": 1541928, + "step": 2345 + }, + { + "epoch": 1.3856132075471699, + "grad_norm": 2.0162837505340576, + "learning_rate": 6.925117924528303e-06, + "loss": 0.6152, + "num_input_tokens_seen": 1545672, + "step": 2350 + }, + { + "epoch": 1.388561320754717, + "grad_norm": 3.287925958633423, + "learning_rate": 6.9398584905660385e-06, + "loss": 0.4749, + "num_input_tokens_seen": 1549256, + "step": 2355 + }, + { + "epoch": 1.3915094339622642, + "grad_norm": 2.5606045722961426, + "learning_rate": 6.954599056603775e-06, + "loss": 0.3977, + "num_input_tokens_seen": 1551912, + "step": 2360 + }, + { + "epoch": 1.3944575471698113, + "grad_norm": 3.1936581134796143, + "learning_rate": 6.96933962264151e-06, + "loss": 0.6615, + "num_input_tokens_seen": 1555848, + "step": 2365 + }, + { + "epoch": 1.3974056603773586, + "grad_norm": 3.9231581687927246, + "learning_rate": 6.984080188679245e-06, + "loss": 0.7967, + "num_input_tokens_seen": 1559080, + "step": 2370 + }, + { + "epoch": 1.4003537735849056, + "grad_norm": 7.183620929718018, + "learning_rate": 6.9988207547169815e-06, + "loss": 0.5446, + "num_input_tokens_seen": 1562216, + "step": 2375 + }, + { + "epoch": 1.4033018867924527, + "grad_norm": 2.132411003112793, + "learning_rate": 7.013561320754717e-06, + "loss": 0.5031, + "num_input_tokens_seen": 1566568, + "step": 2380 + }, + { + "epoch": 1.40625, + "grad_norm": 4.0367937088012695, + "learning_rate": 7.028301886792454e-06, + "loss": 0.6425, + "num_input_tokens_seen": 1569608, + "step": 2385 + }, + { + "epoch": 1.4091981132075473, + "grad_norm": 2.6854450702667236, + "learning_rate": 7.043042452830188e-06, + "loss": 0.5996, + "num_input_tokens_seen": 1572968, + "step": 2390 + }, + { + "epoch": 1.4121462264150944, + "grad_norm": 5.15828800201416, + "learning_rate": 7.0577830188679255e-06, + "loss": 0.6181, + "num_input_tokens_seen": 1575848, + "step": 2395 + }, + { + "epoch": 1.4150943396226414, + "grad_norm": 2.7500927448272705, + "learning_rate": 7.072523584905661e-06, + "loss": 0.4485, + "num_input_tokens_seen": 1579048, + "step": 2400 + }, + { + "epoch": 1.4180424528301887, + "grad_norm": 7.795233726501465, + "learning_rate": 7.087264150943397e-06, + "loss": 0.6, + "num_input_tokens_seen": 1582408, + "step": 2405 + }, + { + "epoch": 1.4209905660377358, + "grad_norm": 4.185930252075195, + "learning_rate": 7.102004716981132e-06, + "loss": 0.4986, + "num_input_tokens_seen": 1585320, + "step": 2410 + }, + { + "epoch": 1.423938679245283, + "grad_norm": 4.134632110595703, + "learning_rate": 7.1167452830188685e-06, + "loss": 0.545, + "num_input_tokens_seen": 1588584, + "step": 2415 + }, + { + "epoch": 1.4268867924528301, + "grad_norm": 3.4842369556427, + "learning_rate": 7.131485849056604e-06, + "loss": 0.5655, + "num_input_tokens_seen": 1591240, + "step": 2420 + }, + { + "epoch": 1.4298349056603774, + "grad_norm": 2.5747876167297363, + "learning_rate": 7.146226415094341e-06, + "loss": 0.4412, + "num_input_tokens_seen": 1595048, + "step": 2425 + }, + { + "epoch": 1.4327830188679245, + "grad_norm": 2.4336013793945312, + "learning_rate": 7.160966981132076e-06, + "loss": 0.5324, + "num_input_tokens_seen": 1598664, + "step": 2430 + }, + { + "epoch": 1.4357311320754718, + "grad_norm": 6.856317043304443, + "learning_rate": 7.1757075471698125e-06, + "loss": 0.5495, + "num_input_tokens_seen": 1601288, + "step": 2435 + }, + { + "epoch": 1.4386792452830188, + "grad_norm": 2.9326436519622803, + "learning_rate": 7.190448113207548e-06, + "loss": 0.5499, + "num_input_tokens_seen": 1604552, + "step": 2440 + }, + { + "epoch": 1.4416273584905661, + "grad_norm": 3.4927523136138916, + "learning_rate": 7.205188679245284e-06, + "loss": 0.5121, + "num_input_tokens_seen": 1608168, + "step": 2445 + }, + { + "epoch": 1.4445754716981132, + "grad_norm": 3.1840388774871826, + "learning_rate": 7.219929245283019e-06, + "loss": 0.542, + "num_input_tokens_seen": 1611336, + "step": 2450 + }, + { + "epoch": 1.4475235849056602, + "grad_norm": 9.476007461547852, + "learning_rate": 7.2346698113207555e-06, + "loss": 0.5779, + "num_input_tokens_seen": 1614760, + "step": 2455 + }, + { + "epoch": 1.4504716981132075, + "grad_norm": 3.77824068069458, + "learning_rate": 7.249410377358491e-06, + "loss": 0.5957, + "num_input_tokens_seen": 1618376, + "step": 2460 + }, + { + "epoch": 1.4534198113207548, + "grad_norm": 11.594400405883789, + "learning_rate": 7.264150943396226e-06, + "loss": 0.5047, + "num_input_tokens_seen": 1621256, + "step": 2465 + }, + { + "epoch": 1.4563679245283019, + "grad_norm": 1.5943620204925537, + "learning_rate": 7.278891509433963e-06, + "loss": 0.4867, + "num_input_tokens_seen": 1624808, + "step": 2470 + }, + { + "epoch": 1.459316037735849, + "grad_norm": 3.769818067550659, + "learning_rate": 7.293632075471699e-06, + "loss": 0.6964, + "num_input_tokens_seen": 1627688, + "step": 2475 + }, + { + "epoch": 1.4622641509433962, + "grad_norm": 1.8063501119613647, + "learning_rate": 7.308372641509435e-06, + "loss": 0.4521, + "num_input_tokens_seen": 1630216, + "step": 2480 + }, + { + "epoch": 1.4652122641509435, + "grad_norm": 2.5699081420898438, + "learning_rate": 7.32311320754717e-06, + "loss": 0.6018, + "num_input_tokens_seen": 1633800, + "step": 2485 + }, + { + "epoch": 1.4681603773584906, + "grad_norm": 3.077378749847412, + "learning_rate": 7.337853773584906e-06, + "loss": 0.4558, + "num_input_tokens_seen": 1636168, + "step": 2490 + }, + { + "epoch": 1.4711084905660377, + "grad_norm": 3.187596082687378, + "learning_rate": 7.352594339622642e-06, + "loss": 0.4537, + "num_input_tokens_seen": 1638792, + "step": 2495 + }, + { + "epoch": 1.474056603773585, + "grad_norm": 2.2626333236694336, + "learning_rate": 7.367334905660378e-06, + "loss": 0.6826, + "num_input_tokens_seen": 1642088, + "step": 2500 + }, + { + "epoch": 1.477004716981132, + "grad_norm": 2.3543529510498047, + "learning_rate": 7.382075471698113e-06, + "loss": 0.4919, + "num_input_tokens_seen": 1644808, + "step": 2505 + }, + { + "epoch": 1.4799528301886793, + "grad_norm": 4.24591064453125, + "learning_rate": 7.39681603773585e-06, + "loss": 0.5272, + "num_input_tokens_seen": 1647496, + "step": 2510 + }, + { + "epoch": 1.4829009433962264, + "grad_norm": 1.8173989057540894, + "learning_rate": 7.411556603773586e-06, + "loss": 0.4371, + "num_input_tokens_seen": 1650760, + "step": 2515 + }, + { + "epoch": 1.4858490566037736, + "grad_norm": 2.048732042312622, + "learning_rate": 7.426297169811322e-06, + "loss": 0.5664, + "num_input_tokens_seen": 1653864, + "step": 2520 + }, + { + "epoch": 1.4887971698113207, + "grad_norm": 2.793461322784424, + "learning_rate": 7.441037735849057e-06, + "loss": 0.5064, + "num_input_tokens_seen": 1656616, + "step": 2525 + }, + { + "epoch": 1.491745283018868, + "grad_norm": 1.835691213607788, + "learning_rate": 7.455778301886793e-06, + "loss": 0.54, + "num_input_tokens_seen": 1659656, + "step": 2530 + }, + { + "epoch": 1.494693396226415, + "grad_norm": 3.5888102054595947, + "learning_rate": 7.470518867924529e-06, + "loss": 0.6386, + "num_input_tokens_seen": 1662920, + "step": 2535 + }, + { + "epoch": 1.4976415094339623, + "grad_norm": 3.981990098953247, + "learning_rate": 7.485259433962266e-06, + "loss": 0.6062, + "num_input_tokens_seen": 1666472, + "step": 2540 + }, + { + "epoch": 1.5005896226415094, + "grad_norm": 5.066642761230469, + "learning_rate": 7.500000000000001e-06, + "loss": 0.4959, + "num_input_tokens_seen": 1670760, + "step": 2545 + }, + { + "epoch": 1.5035377358490565, + "grad_norm": 2.1030285358428955, + "learning_rate": 7.5147405660377355e-06, + "loss": 0.5104, + "num_input_tokens_seen": 1674120, + "step": 2550 + }, + { + "epoch": 1.5064858490566038, + "grad_norm": 2.4119467735290527, + "learning_rate": 7.5294811320754726e-06, + "loss": 0.4651, + "num_input_tokens_seen": 1677960, + "step": 2555 + }, + { + "epoch": 1.509433962264151, + "grad_norm": 2.145543336868286, + "learning_rate": 7.544221698113208e-06, + "loss": 0.5482, + "num_input_tokens_seen": 1681672, + "step": 2560 + }, + { + "epoch": 1.5123820754716981, + "grad_norm": 7.2220025062561035, + "learning_rate": 7.558962264150944e-06, + "loss": 0.7174, + "num_input_tokens_seen": 1685160, + "step": 2565 + }, + { + "epoch": 1.5153301886792452, + "grad_norm": 2.2348978519439697, + "learning_rate": 7.5737028301886795e-06, + "loss": 0.4796, + "num_input_tokens_seen": 1689032, + "step": 2570 + }, + { + "epoch": 1.5182783018867925, + "grad_norm": 2.266817569732666, + "learning_rate": 7.588443396226416e-06, + "loss": 0.5485, + "num_input_tokens_seen": 1691816, + "step": 2575 + }, + { + "epoch": 1.5212264150943398, + "grad_norm": 2.5918333530426025, + "learning_rate": 7.603183962264151e-06, + "loss": 0.6361, + "num_input_tokens_seen": 1695016, + "step": 2580 + }, + { + "epoch": 1.5241745283018868, + "grad_norm": 2.9727864265441895, + "learning_rate": 7.617924528301888e-06, + "loss": 0.5028, + "num_input_tokens_seen": 1697704, + "step": 2585 + }, + { + "epoch": 1.5271226415094339, + "grad_norm": 5.12477445602417, + "learning_rate": 7.632665094339623e-06, + "loss": 0.5944, + "num_input_tokens_seen": 1700232, + "step": 2590 + }, + { + "epoch": 1.5300707547169812, + "grad_norm": 5.668395519256592, + "learning_rate": 7.64740566037736e-06, + "loss": 0.5677, + "num_input_tokens_seen": 1703144, + "step": 2595 + }, + { + "epoch": 1.5330188679245285, + "grad_norm": 2.8164591789245605, + "learning_rate": 7.662146226415095e-06, + "loss": 0.4764, + "num_input_tokens_seen": 1706600, + "step": 2600 + }, + { + "epoch": 1.5359669811320755, + "grad_norm": 2.616431713104248, + "learning_rate": 7.676886792452832e-06, + "loss": 0.4812, + "num_input_tokens_seen": 1709864, + "step": 2605 + }, + { + "epoch": 1.5389150943396226, + "grad_norm": 4.092056751251221, + "learning_rate": 7.691627358490567e-06, + "loss": 0.6062, + "num_input_tokens_seen": 1712776, + "step": 2610 + }, + { + "epoch": 1.5418632075471699, + "grad_norm": 3.0650835037231445, + "learning_rate": 7.706367924528303e-06, + "loss": 0.3853, + "num_input_tokens_seen": 1715688, + "step": 2615 + }, + { + "epoch": 1.544811320754717, + "grad_norm": 2.5430006980895996, + "learning_rate": 7.721108490566038e-06, + "loss": 0.5598, + "num_input_tokens_seen": 1718632, + "step": 2620 + }, + { + "epoch": 1.547759433962264, + "grad_norm": 2.7769598960876465, + "learning_rate": 7.735849056603775e-06, + "loss": 0.4801, + "num_input_tokens_seen": 1721832, + "step": 2625 + }, + { + "epoch": 1.5507075471698113, + "grad_norm": 4.4878106117248535, + "learning_rate": 7.75058962264151e-06, + "loss": 0.431, + "num_input_tokens_seen": 1724744, + "step": 2630 + }, + { + "epoch": 1.5536556603773586, + "grad_norm": 3.1825881004333496, + "learning_rate": 7.765330188679246e-06, + "loss": 0.6774, + "num_input_tokens_seen": 1727848, + "step": 2635 + }, + { + "epoch": 1.5566037735849056, + "grad_norm": 2.2764179706573486, + "learning_rate": 7.780070754716981e-06, + "loss": 0.3957, + "num_input_tokens_seen": 1730408, + "step": 2640 + }, + { + "epoch": 1.5595518867924527, + "grad_norm": 2.125385284423828, + "learning_rate": 7.794811320754716e-06, + "loss": 0.5313, + "num_input_tokens_seen": 1732552, + "step": 2645 + }, + { + "epoch": 1.5625, + "grad_norm": 6.923036098480225, + "learning_rate": 7.809551886792453e-06, + "loss": 0.4426, + "num_input_tokens_seen": 1735080, + "step": 2650 + }, + { + "epoch": 1.5654481132075473, + "grad_norm": 4.45912504196167, + "learning_rate": 7.824292452830189e-06, + "loss": 0.549, + "num_input_tokens_seen": 1737672, + "step": 2655 + }, + { + "epoch": 1.5683962264150944, + "grad_norm": 1.6451839208602905, + "learning_rate": 7.839033018867926e-06, + "loss": 0.7074, + "num_input_tokens_seen": 1741352, + "step": 2660 + }, + { + "epoch": 1.5713443396226414, + "grad_norm": 4.598958492279053, + "learning_rate": 7.853773584905661e-06, + "loss": 0.5314, + "num_input_tokens_seen": 1744360, + "step": 2665 + }, + { + "epoch": 1.5742924528301887, + "grad_norm": 2.721264600753784, + "learning_rate": 7.868514150943397e-06, + "loss": 0.4833, + "num_input_tokens_seen": 1748136, + "step": 2670 + }, + { + "epoch": 1.577240566037736, + "grad_norm": 2.762397050857544, + "learning_rate": 7.883254716981132e-06, + "loss": 0.5651, + "num_input_tokens_seen": 1751912, + "step": 2675 + }, + { + "epoch": 1.580188679245283, + "grad_norm": 4.709754467010498, + "learning_rate": 7.897995283018869e-06, + "loss": 0.7078, + "num_input_tokens_seen": 1755624, + "step": 2680 + }, + { + "epoch": 1.5831367924528301, + "grad_norm": 1.4719289541244507, + "learning_rate": 7.912735849056604e-06, + "loss": 0.6381, + "num_input_tokens_seen": 1759272, + "step": 2685 + }, + { + "epoch": 1.5860849056603774, + "grad_norm": 1.7858352661132812, + "learning_rate": 7.927476415094341e-06, + "loss": 0.3713, + "num_input_tokens_seen": 1761960, + "step": 2690 + }, + { + "epoch": 1.5890330188679245, + "grad_norm": 2.4074413776397705, + "learning_rate": 7.942216981132077e-06, + "loss": 0.6918, + "num_input_tokens_seen": 1764808, + "step": 2695 + }, + { + "epoch": 1.5919811320754715, + "grad_norm": 4.9043731689453125, + "learning_rate": 7.956957547169812e-06, + "loss": 0.5839, + "num_input_tokens_seen": 1768104, + "step": 2700 + }, + { + "epoch": 1.5949292452830188, + "grad_norm": 7.370609283447266, + "learning_rate": 7.971698113207547e-06, + "loss": 0.6301, + "num_input_tokens_seen": 1770856, + "step": 2705 + }, + { + "epoch": 1.5978773584905661, + "grad_norm": 2.049870252609253, + "learning_rate": 7.986438679245284e-06, + "loss": 0.6001, + "num_input_tokens_seen": 1774376, + "step": 2710 + }, + { + "epoch": 1.6008254716981132, + "grad_norm": 1.861102819442749, + "learning_rate": 8.00117924528302e-06, + "loss": 0.5317, + "num_input_tokens_seen": 1778408, + "step": 2715 + }, + { + "epoch": 1.6037735849056602, + "grad_norm": 2.4521374702453613, + "learning_rate": 8.015919811320757e-06, + "loss": 0.5523, + "num_input_tokens_seen": 1782984, + "step": 2720 + }, + { + "epoch": 1.6067216981132075, + "grad_norm": 2.4826347827911377, + "learning_rate": 8.030660377358492e-06, + "loss": 0.6567, + "num_input_tokens_seen": 1786216, + "step": 2725 + }, + { + "epoch": 1.6096698113207548, + "grad_norm": 6.061221122741699, + "learning_rate": 8.045400943396227e-06, + "loss": 0.6095, + "num_input_tokens_seen": 1788872, + "step": 2730 + }, + { + "epoch": 1.6126179245283019, + "grad_norm": 3.01009202003479, + "learning_rate": 8.060141509433963e-06, + "loss": 0.453, + "num_input_tokens_seen": 1793288, + "step": 2735 + }, + { + "epoch": 1.615566037735849, + "grad_norm": 2.6989073753356934, + "learning_rate": 8.074882075471698e-06, + "loss": 0.6921, + "num_input_tokens_seen": 1797800, + "step": 2740 + }, + { + "epoch": 1.6185141509433962, + "grad_norm": 1.6631203889846802, + "learning_rate": 8.089622641509435e-06, + "loss": 0.5082, + "num_input_tokens_seen": 1800936, + "step": 2745 + }, + { + "epoch": 1.6214622641509435, + "grad_norm": 4.207683086395264, + "learning_rate": 8.10436320754717e-06, + "loss": 0.6135, + "num_input_tokens_seen": 1803560, + "step": 2750 + }, + { + "epoch": 1.6244103773584906, + "grad_norm": 6.315492630004883, + "learning_rate": 8.119103773584906e-06, + "loss": 0.5156, + "num_input_tokens_seen": 1806280, + "step": 2755 + }, + { + "epoch": 1.6273584905660377, + "grad_norm": 3.7335734367370605, + "learning_rate": 8.133844339622641e-06, + "loss": 0.6498, + "num_input_tokens_seen": 1808904, + "step": 2760 + }, + { + "epoch": 1.630306603773585, + "grad_norm": 2.51202392578125, + "learning_rate": 8.148584905660378e-06, + "loss": 0.5332, + "num_input_tokens_seen": 1811752, + "step": 2765 + }, + { + "epoch": 1.6332547169811322, + "grad_norm": 2.711686134338379, + "learning_rate": 8.163325471698114e-06, + "loss": 0.5591, + "num_input_tokens_seen": 1815784, + "step": 2770 + }, + { + "epoch": 1.6362028301886793, + "grad_norm": 2.3494319915771484, + "learning_rate": 8.17806603773585e-06, + "loss": 0.483, + "num_input_tokens_seen": 1818568, + "step": 2775 + }, + { + "epoch": 1.6391509433962264, + "grad_norm": 4.432181358337402, + "learning_rate": 8.192806603773586e-06, + "loss": 0.5385, + "num_input_tokens_seen": 1821288, + "step": 2780 + }, + { + "epoch": 1.6420990566037736, + "grad_norm": 3.001502275466919, + "learning_rate": 8.207547169811321e-06, + "loss": 0.4553, + "num_input_tokens_seen": 1823624, + "step": 2785 + }, + { + "epoch": 1.6450471698113207, + "grad_norm": 11.780852317810059, + "learning_rate": 8.222287735849057e-06, + "loss": 0.717, + "num_input_tokens_seen": 1830280, + "step": 2790 + }, + { + "epoch": 1.6479952830188678, + "grad_norm": 9.597511291503906, + "learning_rate": 8.237028301886794e-06, + "loss": 0.5962, + "num_input_tokens_seen": 1834280, + "step": 2795 + }, + { + "epoch": 1.650943396226415, + "grad_norm": 2.1464192867279053, + "learning_rate": 8.251768867924529e-06, + "loss": 0.5397, + "num_input_tokens_seen": 1838408, + "step": 2800 + }, + { + "epoch": 1.6538915094339623, + "grad_norm": 3.3558614253997803, + "learning_rate": 8.266509433962266e-06, + "loss": 0.4121, + "num_input_tokens_seen": 1842280, + "step": 2805 + }, + { + "epoch": 1.6568396226415094, + "grad_norm": 5.738358497619629, + "learning_rate": 8.281250000000001e-06, + "loss": 0.5273, + "num_input_tokens_seen": 1846504, + "step": 2810 + }, + { + "epoch": 1.6597877358490565, + "grad_norm": 2.227666139602661, + "learning_rate": 8.295990566037737e-06, + "loss": 0.4514, + "num_input_tokens_seen": 1849864, + "step": 2815 + }, + { + "epoch": 1.6627358490566038, + "grad_norm": 3.857785701751709, + "learning_rate": 8.310731132075472e-06, + "loss": 0.4724, + "num_input_tokens_seen": 1852648, + "step": 2820 + }, + { + "epoch": 1.665683962264151, + "grad_norm": 2.4147236347198486, + "learning_rate": 8.325471698113207e-06, + "loss": 0.4499, + "num_input_tokens_seen": 1855784, + "step": 2825 + }, + { + "epoch": 1.6686320754716981, + "grad_norm": 3.537865400314331, + "learning_rate": 8.340212264150944e-06, + "loss": 0.5938, + "num_input_tokens_seen": 1858888, + "step": 2830 + }, + { + "epoch": 1.6715801886792452, + "grad_norm": 3.631096601486206, + "learning_rate": 8.35495283018868e-06, + "loss": 0.5335, + "num_input_tokens_seen": 1861544, + "step": 2835 + }, + { + "epoch": 1.6745283018867925, + "grad_norm": 4.720225811004639, + "learning_rate": 8.369693396226415e-06, + "loss": 0.4885, + "num_input_tokens_seen": 1864712, + "step": 2840 + }, + { + "epoch": 1.6774764150943398, + "grad_norm": 1.8945636749267578, + "learning_rate": 8.38443396226415e-06, + "loss": 0.4812, + "num_input_tokens_seen": 1868232, + "step": 2845 + }, + { + "epoch": 1.6804245283018868, + "grad_norm": 1.3173551559448242, + "learning_rate": 8.399174528301888e-06, + "loss": 0.4645, + "num_input_tokens_seen": 1871880, + "step": 2850 + }, + { + "epoch": 1.6833726415094339, + "grad_norm": 4.247873306274414, + "learning_rate": 8.413915094339623e-06, + "loss": 0.5878, + "num_input_tokens_seen": 1874856, + "step": 2855 + }, + { + "epoch": 1.6863207547169812, + "grad_norm": 4.463372707366943, + "learning_rate": 8.42865566037736e-06, + "loss": 0.5682, + "num_input_tokens_seen": 1878120, + "step": 2860 + }, + { + "epoch": 1.6892688679245285, + "grad_norm": 2.1462230682373047, + "learning_rate": 8.443396226415095e-06, + "loss": 0.5031, + "num_input_tokens_seen": 1881064, + "step": 2865 + }, + { + "epoch": 1.6922169811320755, + "grad_norm": 3.8219377994537354, + "learning_rate": 8.45813679245283e-06, + "loss": 0.4777, + "num_input_tokens_seen": 1884808, + "step": 2870 + }, + { + "epoch": 1.6951650943396226, + "grad_norm": 3.8765056133270264, + "learning_rate": 8.472877358490566e-06, + "loss": 0.4178, + "num_input_tokens_seen": 1888168, + "step": 2875 + }, + { + "epoch": 1.6981132075471699, + "grad_norm": 6.2940144538879395, + "learning_rate": 8.487617924528303e-06, + "loss": 0.5026, + "num_input_tokens_seen": 1890792, + "step": 2880 + }, + { + "epoch": 1.701061320754717, + "grad_norm": 2.945626974105835, + "learning_rate": 8.502358490566038e-06, + "loss": 0.537, + "num_input_tokens_seen": 1894408, + "step": 2885 + }, + { + "epoch": 1.704009433962264, + "grad_norm": 2.455336809158325, + "learning_rate": 8.517099056603775e-06, + "loss": 0.6238, + "num_input_tokens_seen": 1897480, + "step": 2890 + }, + { + "epoch": 1.7069575471698113, + "grad_norm": 3.1407828330993652, + "learning_rate": 8.53183962264151e-06, + "loss": 0.4434, + "num_input_tokens_seen": 1901032, + "step": 2895 + }, + { + "epoch": 1.7099056603773586, + "grad_norm": 1.9645917415618896, + "learning_rate": 8.546580188679246e-06, + "loss": 0.548, + "num_input_tokens_seen": 1904200, + "step": 2900 + }, + { + "epoch": 1.7128537735849056, + "grad_norm": 3.248854637145996, + "learning_rate": 8.561320754716981e-06, + "loss": 0.6487, + "num_input_tokens_seen": 1906984, + "step": 2905 + }, + { + "epoch": 1.7158018867924527, + "grad_norm": 1.8698830604553223, + "learning_rate": 8.576061320754717e-06, + "loss": 0.5328, + "num_input_tokens_seen": 1910664, + "step": 2910 + }, + { + "epoch": 1.71875, + "grad_norm": 2.6611201763153076, + "learning_rate": 8.590801886792454e-06, + "loss": 0.6134, + "num_input_tokens_seen": 1913736, + "step": 2915 + }, + { + "epoch": 1.7216981132075473, + "grad_norm": 2.473524570465088, + "learning_rate": 8.605542452830189e-06, + "loss": 0.5109, + "num_input_tokens_seen": 1916648, + "step": 2920 + }, + { + "epoch": 1.7246462264150944, + "grad_norm": 3.032504081726074, + "learning_rate": 8.620283018867926e-06, + "loss": 0.7577, + "num_input_tokens_seen": 1919272, + "step": 2925 + }, + { + "epoch": 1.7275943396226414, + "grad_norm": 3.726055383682251, + "learning_rate": 8.635023584905662e-06, + "loss": 0.5543, + "num_input_tokens_seen": 1922664, + "step": 2930 + }, + { + "epoch": 1.7305424528301887, + "grad_norm": 7.119819641113281, + "learning_rate": 8.649764150943397e-06, + "loss": 0.4303, + "num_input_tokens_seen": 1925096, + "step": 2935 + }, + { + "epoch": 1.733490566037736, + "grad_norm": 2.822646379470825, + "learning_rate": 8.664504716981132e-06, + "loss": 0.4655, + "num_input_tokens_seen": 1928168, + "step": 2940 + }, + { + "epoch": 1.736438679245283, + "grad_norm": 1.768086552619934, + "learning_rate": 8.67924528301887e-06, + "loss": 0.5053, + "num_input_tokens_seen": 1932808, + "step": 2945 + }, + { + "epoch": 1.7393867924528301, + "grad_norm": 2.935765504837036, + "learning_rate": 8.693985849056605e-06, + "loss": 0.4913, + "num_input_tokens_seen": 1935624, + "step": 2950 + }, + { + "epoch": 1.7423349056603774, + "grad_norm": 3.623666524887085, + "learning_rate": 8.70872641509434e-06, + "loss": 0.4896, + "num_input_tokens_seen": 1938824, + "step": 2955 + }, + { + "epoch": 1.7452830188679245, + "grad_norm": 2.279567241668701, + "learning_rate": 8.723466981132075e-06, + "loss": 0.4742, + "num_input_tokens_seen": 1941864, + "step": 2960 + }, + { + "epoch": 1.7482311320754715, + "grad_norm": 3.118412733078003, + "learning_rate": 8.738207547169812e-06, + "loss": 0.7312, + "num_input_tokens_seen": 1944840, + "step": 2965 + }, + { + "epoch": 1.7511792452830188, + "grad_norm": 2.315941095352173, + "learning_rate": 8.752948113207548e-06, + "loss": 0.3845, + "num_input_tokens_seen": 1947208, + "step": 2970 + }, + { + "epoch": 1.7541273584905661, + "grad_norm": 2.0478763580322266, + "learning_rate": 8.767688679245285e-06, + "loss": 0.4721, + "num_input_tokens_seen": 1949672, + "step": 2975 + }, + { + "epoch": 1.7570754716981132, + "grad_norm": 1.9229283332824707, + "learning_rate": 8.78242924528302e-06, + "loss": 0.577, + "num_input_tokens_seen": 1952616, + "step": 2980 + }, + { + "epoch": 1.7600235849056602, + "grad_norm": 3.9421474933624268, + "learning_rate": 8.797169811320755e-06, + "loss": 0.5033, + "num_input_tokens_seen": 1955016, + "step": 2985 + }, + { + "epoch": 1.7629716981132075, + "grad_norm": 4.925143241882324, + "learning_rate": 8.81191037735849e-06, + "loss": 0.5216, + "num_input_tokens_seen": 1958088, + "step": 2990 + }, + { + "epoch": 1.7659198113207548, + "grad_norm": 3.3835883140563965, + "learning_rate": 8.826650943396226e-06, + "loss": 0.4932, + "num_input_tokens_seen": 1960712, + "step": 2995 + }, + { + "epoch": 1.7688679245283019, + "grad_norm": 2.604309558868408, + "learning_rate": 8.841391509433963e-06, + "loss": 0.6008, + "num_input_tokens_seen": 1964488, + "step": 3000 + }, + { + "epoch": 1.771816037735849, + "grad_norm": 2.102077007293701, + "learning_rate": 8.856132075471698e-06, + "loss": 0.5004, + "num_input_tokens_seen": 1967304, + "step": 3005 + }, + { + "epoch": 1.7747641509433962, + "grad_norm": 2.1938815116882324, + "learning_rate": 8.870872641509435e-06, + "loss": 0.444, + "num_input_tokens_seen": 1970888, + "step": 3010 + }, + { + "epoch": 1.7777122641509435, + "grad_norm": 4.659045219421387, + "learning_rate": 8.88561320754717e-06, + "loss": 0.6787, + "num_input_tokens_seen": 1974568, + "step": 3015 + }, + { + "epoch": 1.7806603773584906, + "grad_norm": 5.26933479309082, + "learning_rate": 8.900353773584906e-06, + "loss": 0.5124, + "num_input_tokens_seen": 1977384, + "step": 3020 + }, + { + "epoch": 1.7836084905660377, + "grad_norm": 1.8948862552642822, + "learning_rate": 8.915094339622642e-06, + "loss": 0.5914, + "num_input_tokens_seen": 1980712, + "step": 3025 + }, + { + "epoch": 1.786556603773585, + "grad_norm": 1.7375677824020386, + "learning_rate": 8.929834905660379e-06, + "loss": 0.4686, + "num_input_tokens_seen": 1983496, + "step": 3030 + }, + { + "epoch": 1.7895047169811322, + "grad_norm": 2.4515326023101807, + "learning_rate": 8.944575471698114e-06, + "loss": 0.4699, + "num_input_tokens_seen": 1986632, + "step": 3035 + }, + { + "epoch": 1.7924528301886793, + "grad_norm": 2.278822898864746, + "learning_rate": 8.959316037735851e-06, + "loss": 0.3655, + "num_input_tokens_seen": 1990664, + "step": 3040 + }, + { + "epoch": 1.7954009433962264, + "grad_norm": 2.8040771484375, + "learning_rate": 8.974056603773586e-06, + "loss": 0.5044, + "num_input_tokens_seen": 1993608, + "step": 3045 + }, + { + "epoch": 1.7983490566037736, + "grad_norm": 2.134744644165039, + "learning_rate": 8.988797169811322e-06, + "loss": 0.492, + "num_input_tokens_seen": 1997192, + "step": 3050 + }, + { + "epoch": 1.8012971698113207, + "grad_norm": 2.776709794998169, + "learning_rate": 9.003537735849057e-06, + "loss": 0.5813, + "num_input_tokens_seen": 2001224, + "step": 3055 + }, + { + "epoch": 1.8042452830188678, + "grad_norm": 2.366702079772949, + "learning_rate": 9.018278301886794e-06, + "loss": 0.5031, + "num_input_tokens_seen": 2003848, + "step": 3060 + }, + { + "epoch": 1.807193396226415, + "grad_norm": 5.882743835449219, + "learning_rate": 9.03301886792453e-06, + "loss": 0.6091, + "num_input_tokens_seen": 2006536, + "step": 3065 + }, + { + "epoch": 1.8101415094339623, + "grad_norm": 5.8054327964782715, + "learning_rate": 9.047759433962265e-06, + "loss": 0.5772, + "num_input_tokens_seen": 2010472, + "step": 3070 + }, + { + "epoch": 1.8130896226415094, + "grad_norm": 1.5873346328735352, + "learning_rate": 9.0625e-06, + "loss": 0.457, + "num_input_tokens_seen": 2013672, + "step": 3075 + }, + { + "epoch": 1.8160377358490565, + "grad_norm": 6.886104106903076, + "learning_rate": 9.077240566037735e-06, + "loss": 0.7132, + "num_input_tokens_seen": 2016232, + "step": 3080 + }, + { + "epoch": 1.8189858490566038, + "grad_norm": 1.863874912261963, + "learning_rate": 9.091981132075472e-06, + "loss": 0.5605, + "num_input_tokens_seen": 2019080, + "step": 3085 + }, + { + "epoch": 1.821933962264151, + "grad_norm": 2.7448742389678955, + "learning_rate": 9.106721698113208e-06, + "loss": 0.5342, + "num_input_tokens_seen": 2021928, + "step": 3090 + }, + { + "epoch": 1.8248820754716981, + "grad_norm": 3.2121334075927734, + "learning_rate": 9.121462264150945e-06, + "loss": 0.569, + "num_input_tokens_seen": 2024584, + "step": 3095 + }, + { + "epoch": 1.8278301886792452, + "grad_norm": 2.3998215198516846, + "learning_rate": 9.13620283018868e-06, + "loss": 0.5409, + "num_input_tokens_seen": 2028072, + "step": 3100 + }, + { + "epoch": 1.8307783018867925, + "grad_norm": 2.167696237564087, + "learning_rate": 9.150943396226416e-06, + "loss": 0.4276, + "num_input_tokens_seen": 2031048, + "step": 3105 + }, + { + "epoch": 1.8337264150943398, + "grad_norm": 2.7847344875335693, + "learning_rate": 9.165683962264151e-06, + "loss": 0.4951, + "num_input_tokens_seen": 2033896, + "step": 3110 + }, + { + "epoch": 1.8366745283018868, + "grad_norm": 4.738274097442627, + "learning_rate": 9.180424528301888e-06, + "loss": 0.5943, + "num_input_tokens_seen": 2036232, + "step": 3115 + }, + { + "epoch": 1.8396226415094339, + "grad_norm": 10.097674369812012, + "learning_rate": 9.195165094339623e-06, + "loss": 0.5577, + "num_input_tokens_seen": 2039208, + "step": 3120 + }, + { + "epoch": 1.8425707547169812, + "grad_norm": 2.300649404525757, + "learning_rate": 9.20990566037736e-06, + "loss": 0.3829, + "num_input_tokens_seen": 2041992, + "step": 3125 + }, + { + "epoch": 1.8455188679245285, + "grad_norm": 1.9927616119384766, + "learning_rate": 9.224646226415096e-06, + "loss": 0.4404, + "num_input_tokens_seen": 2045480, + "step": 3130 + }, + { + "epoch": 1.8484669811320755, + "grad_norm": 1.9734218120574951, + "learning_rate": 9.239386792452831e-06, + "loss": 0.4147, + "num_input_tokens_seen": 2049224, + "step": 3135 + }, + { + "epoch": 1.8514150943396226, + "grad_norm": 3.009843111038208, + "learning_rate": 9.254127358490566e-06, + "loss": 0.7019, + "num_input_tokens_seen": 2052680, + "step": 3140 + }, + { + "epoch": 1.8543632075471699, + "grad_norm": 4.120747089385986, + "learning_rate": 9.268867924528303e-06, + "loss": 0.4869, + "num_input_tokens_seen": 2055912, + "step": 3145 + }, + { + "epoch": 1.857311320754717, + "grad_norm": 1.9137389659881592, + "learning_rate": 9.283608490566039e-06, + "loss": 0.5925, + "num_input_tokens_seen": 2059496, + "step": 3150 + }, + { + "epoch": 1.860259433962264, + "grad_norm": 2.596644878387451, + "learning_rate": 9.298349056603774e-06, + "loss": 0.5065, + "num_input_tokens_seen": 2062600, + "step": 3155 + }, + { + "epoch": 1.8632075471698113, + "grad_norm": 2.587611198425293, + "learning_rate": 9.31308962264151e-06, + "loss": 0.4166, + "num_input_tokens_seen": 2065704, + "step": 3160 + }, + { + "epoch": 1.8661556603773586, + "grad_norm": 3.6842384338378906, + "learning_rate": 9.327830188679245e-06, + "loss": 0.4984, + "num_input_tokens_seen": 2068104, + "step": 3165 + }, + { + "epoch": 1.8691037735849056, + "grad_norm": 8.193092346191406, + "learning_rate": 9.342570754716982e-06, + "loss": 0.6733, + "num_input_tokens_seen": 2071240, + "step": 3170 + }, + { + "epoch": 1.8720518867924527, + "grad_norm": 1.2968326807022095, + "learning_rate": 9.357311320754717e-06, + "loss": 0.4775, + "num_input_tokens_seen": 2076456, + "step": 3175 + }, + { + "epoch": 1.875, + "grad_norm": 1.3121097087860107, + "learning_rate": 9.372051886792454e-06, + "loss": 0.4993, + "num_input_tokens_seen": 2079720, + "step": 3180 + }, + { + "epoch": 1.8779481132075473, + "grad_norm": 2.8920905590057373, + "learning_rate": 9.38679245283019e-06, + "loss": 0.7159, + "num_input_tokens_seen": 2083528, + "step": 3185 + }, + { + "epoch": 1.8808962264150944, + "grad_norm": 1.498838186264038, + "learning_rate": 9.401533018867925e-06, + "loss": 0.5269, + "num_input_tokens_seen": 2087400, + "step": 3190 + }, + { + "epoch": 1.8838443396226414, + "grad_norm": 2.6425116062164307, + "learning_rate": 9.41627358490566e-06, + "loss": 0.4166, + "num_input_tokens_seen": 2091176, + "step": 3195 + }, + { + "epoch": 1.8867924528301887, + "grad_norm": 2.202831983566284, + "learning_rate": 9.431014150943397e-06, + "loss": 0.4323, + "num_input_tokens_seen": 2093928, + "step": 3200 + }, + { + "epoch": 1.889740566037736, + "grad_norm": 2.5115792751312256, + "learning_rate": 9.445754716981133e-06, + "loss": 0.4202, + "num_input_tokens_seen": 2097992, + "step": 3205 + }, + { + "epoch": 1.892688679245283, + "grad_norm": 1.6988894939422607, + "learning_rate": 9.46049528301887e-06, + "loss": 0.5423, + "num_input_tokens_seen": 2101256, + "step": 3210 + }, + { + "epoch": 1.8956367924528301, + "grad_norm": 3.428349018096924, + "learning_rate": 9.475235849056605e-06, + "loss": 0.4868, + "num_input_tokens_seen": 2104488, + "step": 3215 + }, + { + "epoch": 1.8985849056603774, + "grad_norm": 6.883533954620361, + "learning_rate": 9.48997641509434e-06, + "loss": 0.475, + "num_input_tokens_seen": 2107464, + "step": 3220 + }, + { + "epoch": 1.9015330188679245, + "grad_norm": 3.8597731590270996, + "learning_rate": 9.504716981132076e-06, + "loss": 0.7006, + "num_input_tokens_seen": 2110280, + "step": 3225 + }, + { + "epoch": 1.9044811320754715, + "grad_norm": 4.466874599456787, + "learning_rate": 9.519457547169813e-06, + "loss": 0.5718, + "num_input_tokens_seen": 2113384, + "step": 3230 + }, + { + "epoch": 1.9074292452830188, + "grad_norm": 1.9670289754867554, + "learning_rate": 9.534198113207548e-06, + "loss": 0.5183, + "num_input_tokens_seen": 2116520, + "step": 3235 + }, + { + "epoch": 1.9103773584905661, + "grad_norm": 3.1198627948760986, + "learning_rate": 9.548938679245285e-06, + "loss": 0.5145, + "num_input_tokens_seen": 2119528, + "step": 3240 + }, + { + "epoch": 1.9133254716981132, + "grad_norm": 2.5630526542663574, + "learning_rate": 9.56367924528302e-06, + "loss": 0.4883, + "num_input_tokens_seen": 2122440, + "step": 3245 + }, + { + "epoch": 1.9162735849056602, + "grad_norm": 1.8640820980072021, + "learning_rate": 9.578419811320756e-06, + "loss": 0.5062, + "num_input_tokens_seen": 2125352, + "step": 3250 + }, + { + "epoch": 1.9192216981132075, + "grad_norm": 2.2215869426727295, + "learning_rate": 9.593160377358491e-06, + "loss": 0.5346, + "num_input_tokens_seen": 2128680, + "step": 3255 + }, + { + "epoch": 1.9221698113207548, + "grad_norm": 3.1649980545043945, + "learning_rate": 9.607900943396226e-06, + "loss": 0.3889, + "num_input_tokens_seen": 2132040, + "step": 3260 + }, + { + "epoch": 1.9251179245283019, + "grad_norm": 4.021702766418457, + "learning_rate": 9.622641509433963e-06, + "loss": 0.4944, + "num_input_tokens_seen": 2135592, + "step": 3265 + }, + { + "epoch": 1.928066037735849, + "grad_norm": 2.178122043609619, + "learning_rate": 9.637382075471699e-06, + "loss": 0.5111, + "num_input_tokens_seen": 2138312, + "step": 3270 + }, + { + "epoch": 1.9310141509433962, + "grad_norm": 3.385652780532837, + "learning_rate": 9.652122641509434e-06, + "loss": 0.5849, + "num_input_tokens_seen": 2141512, + "step": 3275 + }, + { + "epoch": 1.9339622641509435, + "grad_norm": 2.067254066467285, + "learning_rate": 9.66686320754717e-06, + "loss": 0.4269, + "num_input_tokens_seen": 2143528, + "step": 3280 + }, + { + "epoch": 1.9369103773584906, + "grad_norm": 1.750783920288086, + "learning_rate": 9.681603773584907e-06, + "loss": 0.5524, + "num_input_tokens_seen": 2147432, + "step": 3285 + }, + { + "epoch": 1.9398584905660377, + "grad_norm": 3.428699254989624, + "learning_rate": 9.696344339622642e-06, + "loss": 0.5061, + "num_input_tokens_seen": 2150024, + "step": 3290 + }, + { + "epoch": 1.942806603773585, + "grad_norm": 1.5365263223648071, + "learning_rate": 9.711084905660379e-06, + "loss": 0.4664, + "num_input_tokens_seen": 2153160, + "step": 3295 + }, + { + "epoch": 1.9457547169811322, + "grad_norm": 2.592742681503296, + "learning_rate": 9.725825471698114e-06, + "loss": 0.5878, + "num_input_tokens_seen": 2155944, + "step": 3300 + }, + { + "epoch": 1.9487028301886793, + "grad_norm": 2.3278636932373047, + "learning_rate": 9.74056603773585e-06, + "loss": 0.5118, + "num_input_tokens_seen": 2160360, + "step": 3305 + }, + { + "epoch": 1.9516509433962264, + "grad_norm": 3.8329479694366455, + "learning_rate": 9.755306603773585e-06, + "loss": 0.5316, + "num_input_tokens_seen": 2163080, + "step": 3310 + }, + { + "epoch": 1.9545990566037736, + "grad_norm": 1.9475082159042358, + "learning_rate": 9.770047169811322e-06, + "loss": 0.4617, + "num_input_tokens_seen": 2167176, + "step": 3315 + }, + { + "epoch": 1.9575471698113207, + "grad_norm": 3.7722434997558594, + "learning_rate": 9.784787735849057e-06, + "loss": 0.5588, + "num_input_tokens_seen": 2169928, + "step": 3320 + }, + { + "epoch": 1.9604952830188678, + "grad_norm": 3.540411949157715, + "learning_rate": 9.799528301886794e-06, + "loss": 0.7276, + "num_input_tokens_seen": 2172872, + "step": 3325 + }, + { + "epoch": 1.963443396226415, + "grad_norm": 2.510077714920044, + "learning_rate": 9.81426886792453e-06, + "loss": 0.5249, + "num_input_tokens_seen": 2175752, + "step": 3330 + }, + { + "epoch": 1.9663915094339623, + "grad_norm": 3.3462636470794678, + "learning_rate": 9.829009433962265e-06, + "loss": 0.5895, + "num_input_tokens_seen": 2179528, + "step": 3335 + }, + { + "epoch": 1.9693396226415094, + "grad_norm": 1.5495750904083252, + "learning_rate": 9.84375e-06, + "loss": 0.4641, + "num_input_tokens_seen": 2182824, + "step": 3340 + }, + { + "epoch": 1.9722877358490565, + "grad_norm": 1.9227246046066284, + "learning_rate": 9.858490566037736e-06, + "loss": 0.5313, + "num_input_tokens_seen": 2185960, + "step": 3345 + }, + { + "epoch": 1.9752358490566038, + "grad_norm": 5.357235431671143, + "learning_rate": 9.873231132075473e-06, + "loss": 0.4859, + "num_input_tokens_seen": 2189512, + "step": 3350 + }, + { + "epoch": 1.978183962264151, + "grad_norm": 2.080068588256836, + "learning_rate": 9.887971698113208e-06, + "loss": 0.6377, + "num_input_tokens_seen": 2193064, + "step": 3355 + }, + { + "epoch": 1.9811320754716981, + "grad_norm": 2.2908098697662354, + "learning_rate": 9.902712264150945e-06, + "loss": 0.5751, + "num_input_tokens_seen": 2195560, + "step": 3360 + }, + { + "epoch": 1.9840801886792452, + "grad_norm": 2.213653326034546, + "learning_rate": 9.917452830188679e-06, + "loss": 0.6579, + "num_input_tokens_seen": 2200616, + "step": 3365 + }, + { + "epoch": 1.9870283018867925, + "grad_norm": 2.739194393157959, + "learning_rate": 9.932193396226416e-06, + "loss": 0.6471, + "num_input_tokens_seen": 2203880, + "step": 3370 + }, + { + "epoch": 1.9899764150943398, + "grad_norm": 3.510056257247925, + "learning_rate": 9.946933962264151e-06, + "loss": 0.5257, + "num_input_tokens_seen": 2206664, + "step": 3375 + }, + { + "epoch": 1.9929245283018868, + "grad_norm": 2.35871958732605, + "learning_rate": 9.961674528301888e-06, + "loss": 0.5255, + "num_input_tokens_seen": 2208808, + "step": 3380 + }, + { + "epoch": 1.9958726415094339, + "grad_norm": 2.699148416519165, + "learning_rate": 9.976415094339624e-06, + "loss": 0.6001, + "num_input_tokens_seen": 2212200, + "step": 3385 + }, + { + "epoch": 1.9988207547169812, + "grad_norm": 12.069649696350098, + "learning_rate": 9.991155660377359e-06, + "loss": 0.6065, + "num_input_tokens_seen": 2215656, + "step": 3390 + }, + { + "epoch": 2.0, + "eval_loss": 0.5241208672523499, + "eval_runtime": 19.0435, + "eval_samples_per_second": 89.059, + "eval_steps_per_second": 22.265, + "num_input_tokens_seen": 2216560, + "step": 3392 + }, + { + "epoch": 2.0017688679245285, + "grad_norm": 2.4409494400024414, + "learning_rate": 9.999999894098275e-06, + "loss": 0.4593, + "num_input_tokens_seen": 2218416, + "step": 3395 + }, + { + "epoch": 2.0047169811320753, + "grad_norm": 2.269171953201294, + "learning_rate": 9.99999870270391e-06, + "loss": 0.4976, + "num_input_tokens_seen": 2222480, + "step": 3400 + }, + { + "epoch": 2.0076650943396226, + "grad_norm": 3.3023276329040527, + "learning_rate": 9.999996187538341e-06, + "loss": 0.4705, + "num_input_tokens_seen": 2224752, + "step": 3405 + }, + { + "epoch": 2.01061320754717, + "grad_norm": 4.064953327178955, + "learning_rate": 9.999992348602233e-06, + "loss": 0.4812, + "num_input_tokens_seen": 2227216, + "step": 3410 + }, + { + "epoch": 2.013561320754717, + "grad_norm": 2.483228921890259, + "learning_rate": 9.999987185896598e-06, + "loss": 0.6215, + "num_input_tokens_seen": 2230224, + "step": 3415 + }, + { + "epoch": 2.016509433962264, + "grad_norm": 2.4251623153686523, + "learning_rate": 9.99998069942281e-06, + "loss": 0.5158, + "num_input_tokens_seen": 2234288, + "step": 3420 + }, + { + "epoch": 2.0194575471698113, + "grad_norm": 2.1537842750549316, + "learning_rate": 9.999972889182583e-06, + "loss": 0.5134, + "num_input_tokens_seen": 2238224, + "step": 3425 + }, + { + "epoch": 2.0224056603773586, + "grad_norm": 2.692521572113037, + "learning_rate": 9.999963755177984e-06, + "loss": 0.6871, + "num_input_tokens_seen": 2240624, + "step": 3430 + }, + { + "epoch": 2.025353773584906, + "grad_norm": 2.8487043380737305, + "learning_rate": 9.999953297411434e-06, + "loss": 0.6166, + "num_input_tokens_seen": 2243504, + "step": 3435 + }, + { + "epoch": 2.0283018867924527, + "grad_norm": 1.4369525909423828, + "learning_rate": 9.999941515885699e-06, + "loss": 0.5816, + "num_input_tokens_seen": 2247600, + "step": 3440 + }, + { + "epoch": 2.03125, + "grad_norm": 2.5074074268341064, + "learning_rate": 9.999928410603897e-06, + "loss": 0.5001, + "num_input_tokens_seen": 2250544, + "step": 3445 + }, + { + "epoch": 2.0341981132075473, + "grad_norm": 3.1471071243286133, + "learning_rate": 9.999913981569502e-06, + "loss": 0.5998, + "num_input_tokens_seen": 2253552, + "step": 3450 + }, + { + "epoch": 2.037146226415094, + "grad_norm": 14.052713394165039, + "learning_rate": 9.999898228786332e-06, + "loss": 0.5376, + "num_input_tokens_seen": 2256208, + "step": 3455 + }, + { + "epoch": 2.0400943396226414, + "grad_norm": 3.779376983642578, + "learning_rate": 9.999881152258557e-06, + "loss": 0.5176, + "num_input_tokens_seen": 2259536, + "step": 3460 + }, + { + "epoch": 2.0430424528301887, + "grad_norm": 2.664102077484131, + "learning_rate": 9.999862751990697e-06, + "loss": 0.497, + "num_input_tokens_seen": 2262448, + "step": 3465 + }, + { + "epoch": 2.045990566037736, + "grad_norm": 2.0035548210144043, + "learning_rate": 9.999843027987628e-06, + "loss": 0.455, + "num_input_tokens_seen": 2266160, + "step": 3470 + }, + { + "epoch": 2.048938679245283, + "grad_norm": 3.569155216217041, + "learning_rate": 9.999821980254567e-06, + "loss": 0.5841, + "num_input_tokens_seen": 2269232, + "step": 3475 + }, + { + "epoch": 2.05188679245283, + "grad_norm": 1.706276774406433, + "learning_rate": 9.99979960879709e-06, + "loss": 0.4524, + "num_input_tokens_seen": 2272720, + "step": 3480 + }, + { + "epoch": 2.0548349056603774, + "grad_norm": 3.2420685291290283, + "learning_rate": 9.99977591362112e-06, + "loss": 0.4712, + "num_input_tokens_seen": 2275632, + "step": 3485 + }, + { + "epoch": 2.0577830188679247, + "grad_norm": 6.191486835479736, + "learning_rate": 9.999750894732927e-06, + "loss": 0.3858, + "num_input_tokens_seen": 2277936, + "step": 3490 + }, + { + "epoch": 2.0607311320754715, + "grad_norm": 2.5046162605285645, + "learning_rate": 9.999724552139136e-06, + "loss": 0.6106, + "num_input_tokens_seen": 2280624, + "step": 3495 + }, + { + "epoch": 2.063679245283019, + "grad_norm": 2.8309781551361084, + "learning_rate": 9.999696885846724e-06, + "loss": 0.5104, + "num_input_tokens_seen": 2287312, + "step": 3500 + }, + { + "epoch": 2.066627358490566, + "grad_norm": 1.7976757287979126, + "learning_rate": 9.999667895863012e-06, + "loss": 0.4961, + "num_input_tokens_seen": 2289904, + "step": 3505 + }, + { + "epoch": 2.0695754716981134, + "grad_norm": 2.188403606414795, + "learning_rate": 9.99963758219568e-06, + "loss": 0.4808, + "num_input_tokens_seen": 2293264, + "step": 3510 + }, + { + "epoch": 2.0725235849056602, + "grad_norm": 1.96734619140625, + "learning_rate": 9.999605944852749e-06, + "loss": 0.454, + "num_input_tokens_seen": 2296048, + "step": 3515 + }, + { + "epoch": 2.0754716981132075, + "grad_norm": 2.087714195251465, + "learning_rate": 9.999572983842599e-06, + "loss": 0.4187, + "num_input_tokens_seen": 2299280, + "step": 3520 + }, + { + "epoch": 2.078419811320755, + "grad_norm": 1.04439115524292, + "learning_rate": 9.999538699173951e-06, + "loss": 0.4511, + "num_input_tokens_seen": 2302128, + "step": 3525 + }, + { + "epoch": 2.081367924528302, + "grad_norm": 3.2082581520080566, + "learning_rate": 9.99950309085589e-06, + "loss": 0.4407, + "num_input_tokens_seen": 2306896, + "step": 3530 + }, + { + "epoch": 2.084316037735849, + "grad_norm": 10.509344100952148, + "learning_rate": 9.999466158897835e-06, + "loss": 0.4563, + "num_input_tokens_seen": 2309808, + "step": 3535 + }, + { + "epoch": 2.0872641509433962, + "grad_norm": 2.3370521068573, + "learning_rate": 9.999427903309569e-06, + "loss": 0.566, + "num_input_tokens_seen": 2313104, + "step": 3540 + }, + { + "epoch": 2.0902122641509435, + "grad_norm": 1.587203025817871, + "learning_rate": 9.99938832410122e-06, + "loss": 0.5894, + "num_input_tokens_seen": 2316368, + "step": 3545 + }, + { + "epoch": 2.0931603773584904, + "grad_norm": 1.8656679391860962, + "learning_rate": 9.999347421283267e-06, + "loss": 0.459, + "num_input_tokens_seen": 2319184, + "step": 3550 + }, + { + "epoch": 2.0961084905660377, + "grad_norm": 2.6150450706481934, + "learning_rate": 9.999305194866538e-06, + "loss": 0.6886, + "num_input_tokens_seen": 2322896, + "step": 3555 + }, + { + "epoch": 2.099056603773585, + "grad_norm": 1.2895225286483765, + "learning_rate": 9.99926164486221e-06, + "loss": 0.4123, + "num_input_tokens_seen": 2326224, + "step": 3560 + }, + { + "epoch": 2.1020047169811322, + "grad_norm": 3.896390676498413, + "learning_rate": 9.99921677128182e-06, + "loss": 0.5663, + "num_input_tokens_seen": 2329552, + "step": 3565 + }, + { + "epoch": 2.104952830188679, + "grad_norm": 1.6260031461715698, + "learning_rate": 9.99917057413724e-06, + "loss": 0.6796, + "num_input_tokens_seen": 2334288, + "step": 3570 + }, + { + "epoch": 2.1079009433962264, + "grad_norm": 2.6255950927734375, + "learning_rate": 9.99912305344071e-06, + "loss": 0.5253, + "num_input_tokens_seen": 2337968, + "step": 3575 + }, + { + "epoch": 2.1108490566037736, + "grad_norm": 1.1682586669921875, + "learning_rate": 9.999074209204803e-06, + "loss": 0.4651, + "num_input_tokens_seen": 2342160, + "step": 3580 + }, + { + "epoch": 2.113797169811321, + "grad_norm": 2.3923587799072266, + "learning_rate": 9.999024041442455e-06, + "loss": 0.4624, + "num_input_tokens_seen": 2346000, + "step": 3585 + }, + { + "epoch": 2.1167452830188678, + "grad_norm": 1.3662807941436768, + "learning_rate": 9.998972550166948e-06, + "loss": 0.4804, + "num_input_tokens_seen": 2348656, + "step": 3590 + }, + { + "epoch": 2.119693396226415, + "grad_norm": 2.483086585998535, + "learning_rate": 9.998919735391915e-06, + "loss": 0.427, + "num_input_tokens_seen": 2351344, + "step": 3595 + }, + { + "epoch": 2.1226415094339623, + "grad_norm": 2.371692419052124, + "learning_rate": 9.998865597131336e-06, + "loss": 0.5055, + "num_input_tokens_seen": 2354160, + "step": 3600 + }, + { + "epoch": 2.1255896226415096, + "grad_norm": 1.2813798189163208, + "learning_rate": 9.998810135399545e-06, + "loss": 0.4365, + "num_input_tokens_seen": 2357744, + "step": 3605 + }, + { + "epoch": 2.1285377358490565, + "grad_norm": 1.744266152381897, + "learning_rate": 9.99875335021123e-06, + "loss": 0.4861, + "num_input_tokens_seen": 2361264, + "step": 3610 + }, + { + "epoch": 2.1314858490566038, + "grad_norm": 2.644942283630371, + "learning_rate": 9.998695241581423e-06, + "loss": 0.394, + "num_input_tokens_seen": 2365072, + "step": 3615 + }, + { + "epoch": 2.134433962264151, + "grad_norm": 1.478508710861206, + "learning_rate": 9.998635809525504e-06, + "loss": 0.5237, + "num_input_tokens_seen": 2367856, + "step": 3620 + }, + { + "epoch": 2.137382075471698, + "grad_norm": 2.8589885234832764, + "learning_rate": 9.998575054059212e-06, + "loss": 0.4819, + "num_input_tokens_seen": 2370576, + "step": 3625 + }, + { + "epoch": 2.140330188679245, + "grad_norm": 7.645646095275879, + "learning_rate": 9.998512975198633e-06, + "loss": 0.4702, + "num_input_tokens_seen": 2372912, + "step": 3630 + }, + { + "epoch": 2.1432783018867925, + "grad_norm": 1.6633960008621216, + "learning_rate": 9.998449572960202e-06, + "loss": 0.4821, + "num_input_tokens_seen": 2376688, + "step": 3635 + }, + { + "epoch": 2.1462264150943398, + "grad_norm": 3.9250335693359375, + "learning_rate": 9.998384847360705e-06, + "loss": 0.8291, + "num_input_tokens_seen": 2379568, + "step": 3640 + }, + { + "epoch": 2.1491745283018866, + "grad_norm": 1.8899283409118652, + "learning_rate": 9.998318798417276e-06, + "loss": 0.5631, + "num_input_tokens_seen": 2383056, + "step": 3645 + }, + { + "epoch": 2.152122641509434, + "grad_norm": 6.645150184631348, + "learning_rate": 9.998251426147403e-06, + "loss": 0.4873, + "num_input_tokens_seen": 2385872, + "step": 3650 + }, + { + "epoch": 2.155070754716981, + "grad_norm": 2.8679258823394775, + "learning_rate": 9.998182730568927e-06, + "loss": 0.5898, + "num_input_tokens_seen": 2388560, + "step": 3655 + }, + { + "epoch": 2.1580188679245285, + "grad_norm": 2.6323959827423096, + "learning_rate": 9.998112711700028e-06, + "loss": 0.5721, + "num_input_tokens_seen": 2392016, + "step": 3660 + }, + { + "epoch": 2.1609669811320753, + "grad_norm": 2.7856907844543457, + "learning_rate": 9.99804136955925e-06, + "loss": 0.6504, + "num_input_tokens_seen": 2394576, + "step": 3665 + }, + { + "epoch": 2.1639150943396226, + "grad_norm": 3.2992379665374756, + "learning_rate": 9.99796870416548e-06, + "loss": 0.5721, + "num_input_tokens_seen": 2397424, + "step": 3670 + }, + { + "epoch": 2.16686320754717, + "grad_norm": 2.566983461380005, + "learning_rate": 9.997894715537953e-06, + "loss": 0.4476, + "num_input_tokens_seen": 2400848, + "step": 3675 + }, + { + "epoch": 2.169811320754717, + "grad_norm": 1.0927411317825317, + "learning_rate": 9.997819403696263e-06, + "loss": 0.4068, + "num_input_tokens_seen": 2404688, + "step": 3680 + }, + { + "epoch": 2.172759433962264, + "grad_norm": 1.9017906188964844, + "learning_rate": 9.997742768660345e-06, + "loss": 0.6369, + "num_input_tokens_seen": 2407632, + "step": 3685 + }, + { + "epoch": 2.1757075471698113, + "grad_norm": 2.437448024749756, + "learning_rate": 9.99766481045049e-06, + "loss": 0.456, + "num_input_tokens_seen": 2412464, + "step": 3690 + }, + { + "epoch": 2.1786556603773586, + "grad_norm": 3.0124521255493164, + "learning_rate": 9.997585529087338e-06, + "loss": 0.3509, + "num_input_tokens_seen": 2415440, + "step": 3695 + }, + { + "epoch": 2.1816037735849054, + "grad_norm": 1.8364503383636475, + "learning_rate": 9.997504924591878e-06, + "loss": 0.6574, + "num_input_tokens_seen": 2418928, + "step": 3700 + }, + { + "epoch": 2.1845518867924527, + "grad_norm": 1.6308305263519287, + "learning_rate": 9.99742299698545e-06, + "loss": 0.4771, + "num_input_tokens_seen": 2422160, + "step": 3705 + }, + { + "epoch": 2.1875, + "grad_norm": 2.1061747074127197, + "learning_rate": 9.997339746289749e-06, + "loss": 0.4365, + "num_input_tokens_seen": 2426352, + "step": 3710 + }, + { + "epoch": 2.1904481132075473, + "grad_norm": 1.671177625656128, + "learning_rate": 9.997255172526812e-06, + "loss": 0.4523, + "num_input_tokens_seen": 2430544, + "step": 3715 + }, + { + "epoch": 2.1933962264150946, + "grad_norm": 1.9334760904312134, + "learning_rate": 9.99716927571903e-06, + "loss": 0.5889, + "num_input_tokens_seen": 2433744, + "step": 3720 + }, + { + "epoch": 2.1963443396226414, + "grad_norm": 2.145174741744995, + "learning_rate": 9.997082055889147e-06, + "loss": 0.5652, + "num_input_tokens_seen": 2436624, + "step": 3725 + }, + { + "epoch": 2.1992924528301887, + "grad_norm": 2.908642292022705, + "learning_rate": 9.996993513060252e-06, + "loss": 0.4681, + "num_input_tokens_seen": 2438672, + "step": 3730 + }, + { + "epoch": 2.202240566037736, + "grad_norm": 1.6083283424377441, + "learning_rate": 9.996903647255789e-06, + "loss": 0.3877, + "num_input_tokens_seen": 2442256, + "step": 3735 + }, + { + "epoch": 2.205188679245283, + "grad_norm": 3.1025285720825195, + "learning_rate": 9.99681245849955e-06, + "loss": 0.5571, + "num_input_tokens_seen": 2445264, + "step": 3740 + }, + { + "epoch": 2.20813679245283, + "grad_norm": 20.3349666595459, + "learning_rate": 9.996719946815679e-06, + "loss": 0.5638, + "num_input_tokens_seen": 2448816, + "step": 3745 + }, + { + "epoch": 2.2110849056603774, + "grad_norm": 4.318816661834717, + "learning_rate": 9.996626112228665e-06, + "loss": 0.5967, + "num_input_tokens_seen": 2451344, + "step": 3750 + }, + { + "epoch": 2.2140330188679247, + "grad_norm": 1.7857407331466675, + "learning_rate": 9.996530954763355e-06, + "loss": 0.3934, + "num_input_tokens_seen": 2453808, + "step": 3755 + }, + { + "epoch": 2.2169811320754715, + "grad_norm": 2.3087985515594482, + "learning_rate": 9.99643447444494e-06, + "loss": 0.3961, + "num_input_tokens_seen": 2456816, + "step": 3760 + }, + { + "epoch": 2.219929245283019, + "grad_norm": 1.8218660354614258, + "learning_rate": 9.996336671298965e-06, + "loss": 0.5955, + "num_input_tokens_seen": 2460816, + "step": 3765 + }, + { + "epoch": 2.222877358490566, + "grad_norm": 10.457060813903809, + "learning_rate": 9.996237545351323e-06, + "loss": 0.6665, + "num_input_tokens_seen": 2464016, + "step": 3770 + }, + { + "epoch": 2.2258254716981134, + "grad_norm": 2.463376760482788, + "learning_rate": 9.996137096628259e-06, + "loss": 0.4216, + "num_input_tokens_seen": 2467536, + "step": 3775 + }, + { + "epoch": 2.2287735849056602, + "grad_norm": 2.1375865936279297, + "learning_rate": 9.996035325156366e-06, + "loss": 0.4539, + "num_input_tokens_seen": 2470928, + "step": 3780 + }, + { + "epoch": 2.2317216981132075, + "grad_norm": 1.808367371559143, + "learning_rate": 9.995932230962589e-06, + "loss": 0.3544, + "num_input_tokens_seen": 2474928, + "step": 3785 + }, + { + "epoch": 2.234669811320755, + "grad_norm": 1.7464582920074463, + "learning_rate": 9.995827814074223e-06, + "loss": 0.7117, + "num_input_tokens_seen": 2477456, + "step": 3790 + }, + { + "epoch": 2.237617924528302, + "grad_norm": 2.25754976272583, + "learning_rate": 9.995722074518913e-06, + "loss": 0.6492, + "num_input_tokens_seen": 2479888, + "step": 3795 + }, + { + "epoch": 2.240566037735849, + "grad_norm": 1.4202998876571655, + "learning_rate": 9.99561501232465e-06, + "loss": 0.2765, + "num_input_tokens_seen": 2483344, + "step": 3800 + }, + { + "epoch": 2.2435141509433962, + "grad_norm": 1.5416580438613892, + "learning_rate": 9.995506627519786e-06, + "loss": 0.4672, + "num_input_tokens_seen": 2486448, + "step": 3805 + }, + { + "epoch": 2.2464622641509435, + "grad_norm": 2.608919620513916, + "learning_rate": 9.995396920133012e-06, + "loss": 0.7162, + "num_input_tokens_seen": 2489392, + "step": 3810 + }, + { + "epoch": 2.2494103773584904, + "grad_norm": 1.4983595609664917, + "learning_rate": 9.995285890193373e-06, + "loss": 0.4994, + "num_input_tokens_seen": 2492400, + "step": 3815 + }, + { + "epoch": 2.2523584905660377, + "grad_norm": 2.3803086280822754, + "learning_rate": 9.995173537730267e-06, + "loss": 0.4635, + "num_input_tokens_seen": 2495088, + "step": 3820 + }, + { + "epoch": 2.255306603773585, + "grad_norm": 1.5899994373321533, + "learning_rate": 9.99505986277344e-06, + "loss": 0.4547, + "num_input_tokens_seen": 2498640, + "step": 3825 + }, + { + "epoch": 2.2582547169811322, + "grad_norm": 2.9083821773529053, + "learning_rate": 9.994944865352986e-06, + "loss": 0.5333, + "num_input_tokens_seen": 2501424, + "step": 3830 + }, + { + "epoch": 2.261202830188679, + "grad_norm": 1.6034116744995117, + "learning_rate": 9.994828545499351e-06, + "loss": 0.461, + "num_input_tokens_seen": 2504816, + "step": 3835 + }, + { + "epoch": 2.2641509433962264, + "grad_norm": 1.83975350856781, + "learning_rate": 9.994710903243334e-06, + "loss": 0.6171, + "num_input_tokens_seen": 2508272, + "step": 3840 + }, + { + "epoch": 2.2670990566037736, + "grad_norm": 1.3657933473587036, + "learning_rate": 9.994591938616079e-06, + "loss": 0.5321, + "num_input_tokens_seen": 2511664, + "step": 3845 + }, + { + "epoch": 2.270047169811321, + "grad_norm": 3.2497334480285645, + "learning_rate": 9.994471651649082e-06, + "loss": 0.646, + "num_input_tokens_seen": 2514384, + "step": 3850 + }, + { + "epoch": 2.2729952830188678, + "grad_norm": 4.231931209564209, + "learning_rate": 9.99435004237419e-06, + "loss": 0.4872, + "num_input_tokens_seen": 2517136, + "step": 3855 + }, + { + "epoch": 2.275943396226415, + "grad_norm": 1.5508261919021606, + "learning_rate": 9.9942271108236e-06, + "loss": 0.3894, + "num_input_tokens_seen": 2520464, + "step": 3860 + }, + { + "epoch": 2.2788915094339623, + "grad_norm": 2.5334084033966064, + "learning_rate": 9.994102857029859e-06, + "loss": 0.5706, + "num_input_tokens_seen": 2523568, + "step": 3865 + }, + { + "epoch": 2.2818396226415096, + "grad_norm": 2.0650854110717773, + "learning_rate": 9.993977281025862e-06, + "loss": 0.55, + "num_input_tokens_seen": 2526928, + "step": 3870 + }, + { + "epoch": 2.2847877358490565, + "grad_norm": 2.080620288848877, + "learning_rate": 9.993850382844858e-06, + "loss": 0.53, + "num_input_tokens_seen": 2530864, + "step": 3875 + }, + { + "epoch": 2.2877358490566038, + "grad_norm": 1.3480745553970337, + "learning_rate": 9.993722162520443e-06, + "loss": 0.504, + "num_input_tokens_seen": 2534992, + "step": 3880 + }, + { + "epoch": 2.290683962264151, + "grad_norm": 2.177626848220825, + "learning_rate": 9.993592620086564e-06, + "loss": 0.6044, + "num_input_tokens_seen": 2538032, + "step": 3885 + }, + { + "epoch": 2.293632075471698, + "grad_norm": 1.4110749959945679, + "learning_rate": 9.993461755577518e-06, + "loss": 0.5696, + "num_input_tokens_seen": 2540912, + "step": 3890 + }, + { + "epoch": 2.296580188679245, + "grad_norm": 1.6280128955841064, + "learning_rate": 9.99332956902795e-06, + "loss": 0.3586, + "num_input_tokens_seen": 2543600, + "step": 3895 + }, + { + "epoch": 2.2995283018867925, + "grad_norm": 2.3977067470550537, + "learning_rate": 9.993196060472859e-06, + "loss": 0.5448, + "num_input_tokens_seen": 2547408, + "step": 3900 + }, + { + "epoch": 2.3024764150943398, + "grad_norm": 1.3785814046859741, + "learning_rate": 9.993061229947591e-06, + "loss": 0.4536, + "num_input_tokens_seen": 2550352, + "step": 3905 + }, + { + "epoch": 2.3054245283018866, + "grad_norm": 2.375364065170288, + "learning_rate": 9.992925077487845e-06, + "loss": 0.571, + "num_input_tokens_seen": 2553456, + "step": 3910 + }, + { + "epoch": 2.308372641509434, + "grad_norm": 2.570561170578003, + "learning_rate": 9.992787603129666e-06, + "loss": 0.5683, + "num_input_tokens_seen": 2556560, + "step": 3915 + }, + { + "epoch": 2.311320754716981, + "grad_norm": 3.5251641273498535, + "learning_rate": 9.99264880690945e-06, + "loss": 0.5614, + "num_input_tokens_seen": 2560208, + "step": 3920 + }, + { + "epoch": 2.3142688679245285, + "grad_norm": 4.972915172576904, + "learning_rate": 9.992508688863947e-06, + "loss": 0.5344, + "num_input_tokens_seen": 2562768, + "step": 3925 + }, + { + "epoch": 2.3172169811320753, + "grad_norm": 2.352879047393799, + "learning_rate": 9.99236724903025e-06, + "loss": 0.4333, + "num_input_tokens_seen": 2566416, + "step": 3930 + }, + { + "epoch": 2.3201650943396226, + "grad_norm": 13.549699783325195, + "learning_rate": 9.992224487445809e-06, + "loss": 0.5047, + "num_input_tokens_seen": 2569616, + "step": 3935 + }, + { + "epoch": 2.32311320754717, + "grad_norm": 3.3971519470214844, + "learning_rate": 9.99208040414842e-06, + "loss": 0.4573, + "num_input_tokens_seen": 2572560, + "step": 3940 + }, + { + "epoch": 2.326061320754717, + "grad_norm": 1.5874186754226685, + "learning_rate": 9.99193499917623e-06, + "loss": 0.4473, + "num_input_tokens_seen": 2575312, + "step": 3945 + }, + { + "epoch": 2.329009433962264, + "grad_norm": 1.1310614347457886, + "learning_rate": 9.991788272567735e-06, + "loss": 0.4401, + "num_input_tokens_seen": 2578448, + "step": 3950 + }, + { + "epoch": 2.3319575471698113, + "grad_norm": 1.1150015592575073, + "learning_rate": 9.991640224361781e-06, + "loss": 0.4222, + "num_input_tokens_seen": 2582384, + "step": 3955 + }, + { + "epoch": 2.3349056603773586, + "grad_norm": 1.9409315586090088, + "learning_rate": 9.991490854597563e-06, + "loss": 0.3575, + "num_input_tokens_seen": 2585456, + "step": 3960 + }, + { + "epoch": 2.3378537735849054, + "grad_norm": 2.227268934249878, + "learning_rate": 9.991340163314632e-06, + "loss": 0.4573, + "num_input_tokens_seen": 2590448, + "step": 3965 + }, + { + "epoch": 2.3408018867924527, + "grad_norm": 1.3155293464660645, + "learning_rate": 9.991188150552878e-06, + "loss": 0.5791, + "num_input_tokens_seen": 2593872, + "step": 3970 + }, + { + "epoch": 2.34375, + "grad_norm": 2.829507827758789, + "learning_rate": 9.991034816352553e-06, + "loss": 0.5234, + "num_input_tokens_seen": 2597648, + "step": 3975 + }, + { + "epoch": 2.3466981132075473, + "grad_norm": 1.6845955848693848, + "learning_rate": 9.990880160754248e-06, + "loss": 0.4496, + "num_input_tokens_seen": 2600048, + "step": 3980 + }, + { + "epoch": 2.3496462264150946, + "grad_norm": 2.4369006156921387, + "learning_rate": 9.990724183798914e-06, + "loss": 0.4534, + "num_input_tokens_seen": 2603632, + "step": 3985 + }, + { + "epoch": 2.3525943396226414, + "grad_norm": 1.6276899576187134, + "learning_rate": 9.990566885527841e-06, + "loss": 0.4845, + "num_input_tokens_seen": 2607216, + "step": 3990 + }, + { + "epoch": 2.3555424528301887, + "grad_norm": 1.5323408842086792, + "learning_rate": 9.99040826598268e-06, + "loss": 0.5368, + "num_input_tokens_seen": 2610256, + "step": 3995 + }, + { + "epoch": 2.358490566037736, + "grad_norm": 2.649141311645508, + "learning_rate": 9.990248325205418e-06, + "loss": 0.4007, + "num_input_tokens_seen": 2613616, + "step": 4000 + }, + { + "epoch": 2.361438679245283, + "grad_norm": 2.233700752258301, + "learning_rate": 9.990087063238408e-06, + "loss": 0.5299, + "num_input_tokens_seen": 2617104, + "step": 4005 + }, + { + "epoch": 2.36438679245283, + "grad_norm": 9.76878547668457, + "learning_rate": 9.989924480124342e-06, + "loss": 0.5633, + "num_input_tokens_seen": 2619856, + "step": 4010 + }, + { + "epoch": 2.3673349056603774, + "grad_norm": 1.9927561283111572, + "learning_rate": 9.989760575906264e-06, + "loss": 0.5369, + "num_input_tokens_seen": 2623344, + "step": 4015 + }, + { + "epoch": 2.3702830188679247, + "grad_norm": 3.241832733154297, + "learning_rate": 9.98959535062757e-06, + "loss": 0.5063, + "num_input_tokens_seen": 2627024, + "step": 4020 + }, + { + "epoch": 2.3732311320754715, + "grad_norm": 1.706433653831482, + "learning_rate": 9.989428804332002e-06, + "loss": 0.4324, + "num_input_tokens_seen": 2629968, + "step": 4025 + }, + { + "epoch": 2.376179245283019, + "grad_norm": 4.104156970977783, + "learning_rate": 9.989260937063656e-06, + "loss": 0.524, + "num_input_tokens_seen": 2633328, + "step": 4030 + }, + { + "epoch": 2.379127358490566, + "grad_norm": 1.3404372930526733, + "learning_rate": 9.989091748866973e-06, + "loss": 0.4958, + "num_input_tokens_seen": 2637360, + "step": 4035 + }, + { + "epoch": 2.3820754716981134, + "grad_norm": 2.513782024383545, + "learning_rate": 9.988921239786748e-06, + "loss": 0.436, + "num_input_tokens_seen": 2640496, + "step": 4040 + }, + { + "epoch": 2.3850235849056602, + "grad_norm": 2.727499485015869, + "learning_rate": 9.988749409868124e-06, + "loss": 0.5028, + "num_input_tokens_seen": 2643280, + "step": 4045 + }, + { + "epoch": 2.3879716981132075, + "grad_norm": 1.549669861793518, + "learning_rate": 9.988576259156593e-06, + "loss": 0.5083, + "num_input_tokens_seen": 2646000, + "step": 4050 + }, + { + "epoch": 2.390919811320755, + "grad_norm": 1.8130030632019043, + "learning_rate": 9.988401787697996e-06, + "loss": 0.4772, + "num_input_tokens_seen": 2649520, + "step": 4055 + }, + { + "epoch": 2.393867924528302, + "grad_norm": 2.0780134201049805, + "learning_rate": 9.98822599553853e-06, + "loss": 0.5954, + "num_input_tokens_seen": 2652528, + "step": 4060 + }, + { + "epoch": 2.396816037735849, + "grad_norm": 2.7208192348480225, + "learning_rate": 9.988048882724732e-06, + "loss": 0.519, + "num_input_tokens_seen": 2655184, + "step": 4065 + }, + { + "epoch": 2.3997641509433962, + "grad_norm": 3.3026835918426514, + "learning_rate": 9.987870449303497e-06, + "loss": 0.4735, + "num_input_tokens_seen": 2658192, + "step": 4070 + }, + { + "epoch": 2.4027122641509435, + "grad_norm": 2.592559814453125, + "learning_rate": 9.98769069532206e-06, + "loss": 0.4967, + "num_input_tokens_seen": 2660944, + "step": 4075 + }, + { + "epoch": 2.4056603773584904, + "grad_norm": 2.1517956256866455, + "learning_rate": 9.98750962082802e-06, + "loss": 0.5323, + "num_input_tokens_seen": 2663280, + "step": 4080 + }, + { + "epoch": 2.4086084905660377, + "grad_norm": 1.9323025941848755, + "learning_rate": 9.987327225869312e-06, + "loss": 0.6117, + "num_input_tokens_seen": 2666576, + "step": 4085 + }, + { + "epoch": 2.411556603773585, + "grad_norm": 1.5806341171264648, + "learning_rate": 9.987143510494225e-06, + "loss": 0.5078, + "num_input_tokens_seen": 2670064, + "step": 4090 + }, + { + "epoch": 2.4145047169811322, + "grad_norm": 1.1306326389312744, + "learning_rate": 9.9869584747514e-06, + "loss": 0.4714, + "num_input_tokens_seen": 2673680, + "step": 4095 + }, + { + "epoch": 2.417452830188679, + "grad_norm": 1.7992746829986572, + "learning_rate": 9.986772118689828e-06, + "loss": 0.535, + "num_input_tokens_seen": 2676688, + "step": 4100 + }, + { + "epoch": 2.4204009433962264, + "grad_norm": 1.7710566520690918, + "learning_rate": 9.986584442358845e-06, + "loss": 0.6292, + "num_input_tokens_seen": 2681200, + "step": 4105 + }, + { + "epoch": 2.4233490566037736, + "grad_norm": 1.5686014890670776, + "learning_rate": 9.98639544580814e-06, + "loss": 0.4655, + "num_input_tokens_seen": 2684336, + "step": 4110 + }, + { + "epoch": 2.426297169811321, + "grad_norm": 1.2640758752822876, + "learning_rate": 9.98620512908775e-06, + "loss": 0.4135, + "num_input_tokens_seen": 2687344, + "step": 4115 + }, + { + "epoch": 2.4292452830188678, + "grad_norm": 3.3218111991882324, + "learning_rate": 9.986013492248064e-06, + "loss": 0.3735, + "num_input_tokens_seen": 2690224, + "step": 4120 + }, + { + "epoch": 2.432193396226415, + "grad_norm": 1.7268750667572021, + "learning_rate": 9.985820535339817e-06, + "loss": 0.4501, + "num_input_tokens_seen": 2693104, + "step": 4125 + }, + { + "epoch": 2.4351415094339623, + "grad_norm": 1.0716211795806885, + "learning_rate": 9.985626258414093e-06, + "loss": 0.4762, + "num_input_tokens_seen": 2695248, + "step": 4130 + }, + { + "epoch": 2.4380896226415096, + "grad_norm": 1.8963731527328491, + "learning_rate": 9.985430661522333e-06, + "loss": 0.3703, + "num_input_tokens_seen": 2698672, + "step": 4135 + }, + { + "epoch": 2.4410377358490565, + "grad_norm": 3.09753155708313, + "learning_rate": 9.98523374471632e-06, + "loss": 0.5363, + "num_input_tokens_seen": 2701264, + "step": 4140 + }, + { + "epoch": 2.4439858490566038, + "grad_norm": 1.3349759578704834, + "learning_rate": 9.985035508048186e-06, + "loss": 0.5639, + "num_input_tokens_seen": 2704464, + "step": 4145 + }, + { + "epoch": 2.446933962264151, + "grad_norm": 2.703191041946411, + "learning_rate": 9.984835951570417e-06, + "loss": 0.509, + "num_input_tokens_seen": 2708080, + "step": 4150 + }, + { + "epoch": 2.449882075471698, + "grad_norm": 2.7819161415100098, + "learning_rate": 9.984635075335847e-06, + "loss": 0.5347, + "num_input_tokens_seen": 2710832, + "step": 4155 + }, + { + "epoch": 2.452830188679245, + "grad_norm": 1.8129849433898926, + "learning_rate": 9.984432879397659e-06, + "loss": 0.5451, + "num_input_tokens_seen": 2713712, + "step": 4160 + }, + { + "epoch": 2.4557783018867925, + "grad_norm": 2.087080717086792, + "learning_rate": 9.984229363809383e-06, + "loss": 0.4389, + "num_input_tokens_seen": 2716688, + "step": 4165 + }, + { + "epoch": 2.4587264150943398, + "grad_norm": 1.6445637941360474, + "learning_rate": 9.984024528624904e-06, + "loss": 0.3589, + "num_input_tokens_seen": 2719440, + "step": 4170 + }, + { + "epoch": 2.4616745283018866, + "grad_norm": 3.5430915355682373, + "learning_rate": 9.98381837389845e-06, + "loss": 0.4865, + "num_input_tokens_seen": 2721840, + "step": 4175 + }, + { + "epoch": 2.464622641509434, + "grad_norm": 1.6669857501983643, + "learning_rate": 9.983610899684601e-06, + "loss": 0.5398, + "num_input_tokens_seen": 2726192, + "step": 4180 + }, + { + "epoch": 2.467570754716981, + "grad_norm": 3.279939889907837, + "learning_rate": 9.983402106038292e-06, + "loss": 0.5636, + "num_input_tokens_seen": 2728816, + "step": 4185 + }, + { + "epoch": 2.4705188679245285, + "grad_norm": 7.45900821685791, + "learning_rate": 9.983191993014793e-06, + "loss": 0.5628, + "num_input_tokens_seen": 2731344, + "step": 4190 + }, + { + "epoch": 2.4734669811320753, + "grad_norm": 2.8235793113708496, + "learning_rate": 9.982980560669742e-06, + "loss": 0.4141, + "num_input_tokens_seen": 2734096, + "step": 4195 + }, + { + "epoch": 2.4764150943396226, + "grad_norm": 2.873528480529785, + "learning_rate": 9.98276780905911e-06, + "loss": 0.4932, + "num_input_tokens_seen": 2737040, + "step": 4200 + }, + { + "epoch": 2.47936320754717, + "grad_norm": 1.5973680019378662, + "learning_rate": 9.982553738239225e-06, + "loss": 0.4105, + "num_input_tokens_seen": 2740528, + "step": 4205 + }, + { + "epoch": 2.482311320754717, + "grad_norm": 2.9403560161590576, + "learning_rate": 9.982338348266766e-06, + "loss": 0.4165, + "num_input_tokens_seen": 2743728, + "step": 4210 + }, + { + "epoch": 2.485259433962264, + "grad_norm": 2.4394748210906982, + "learning_rate": 9.982121639198756e-06, + "loss": 0.5364, + "num_input_tokens_seen": 2748464, + "step": 4215 + }, + { + "epoch": 2.4882075471698113, + "grad_norm": 2.896411895751953, + "learning_rate": 9.98190361109257e-06, + "loss": 0.5718, + "num_input_tokens_seen": 2752784, + "step": 4220 + }, + { + "epoch": 2.4911556603773586, + "grad_norm": 1.8368418216705322, + "learning_rate": 9.981684264005934e-06, + "loss": 0.7493, + "num_input_tokens_seen": 2755728, + "step": 4225 + }, + { + "epoch": 2.4941037735849054, + "grad_norm": 1.5333503484725952, + "learning_rate": 9.981463597996917e-06, + "loss": 0.4883, + "num_input_tokens_seen": 2758960, + "step": 4230 + }, + { + "epoch": 2.4970518867924527, + "grad_norm": 3.050692319869995, + "learning_rate": 9.981241613123944e-06, + "loss": 0.532, + "num_input_tokens_seen": 2762480, + "step": 4235 + }, + { + "epoch": 2.5, + "grad_norm": 1.6425899267196655, + "learning_rate": 9.981018309445785e-06, + "loss": 0.406, + "num_input_tokens_seen": 2765808, + "step": 4240 + }, + { + "epoch": 2.5029481132075473, + "grad_norm": 6.3773112297058105, + "learning_rate": 9.980793687021564e-06, + "loss": 0.4636, + "num_input_tokens_seen": 2768720, + "step": 4245 + }, + { + "epoch": 2.5058962264150946, + "grad_norm": 2.54063081741333, + "learning_rate": 9.980567745910746e-06, + "loss": 0.433, + "num_input_tokens_seen": 2772624, + "step": 4250 + }, + { + "epoch": 2.5088443396226414, + "grad_norm": 1.9294198751449585, + "learning_rate": 9.980340486173155e-06, + "loss": 0.76, + "num_input_tokens_seen": 2776048, + "step": 4255 + }, + { + "epoch": 2.5117924528301887, + "grad_norm": 2.7724790573120117, + "learning_rate": 9.980111907868954e-06, + "loss": 0.5102, + "num_input_tokens_seen": 2778576, + "step": 4260 + }, + { + "epoch": 2.514740566037736, + "grad_norm": 2.284367799758911, + "learning_rate": 9.979882011058662e-06, + "loss": 0.4261, + "num_input_tokens_seen": 2781328, + "step": 4265 + }, + { + "epoch": 2.517688679245283, + "grad_norm": 1.9525171518325806, + "learning_rate": 9.979650795803146e-06, + "loss": 0.5924, + "num_input_tokens_seen": 2784432, + "step": 4270 + }, + { + "epoch": 2.52063679245283, + "grad_norm": 1.3154902458190918, + "learning_rate": 9.979418262163621e-06, + "loss": 0.3881, + "num_input_tokens_seen": 2787088, + "step": 4275 + }, + { + "epoch": 2.5235849056603774, + "grad_norm": 2.033742904663086, + "learning_rate": 9.979184410201652e-06, + "loss": 0.4902, + "num_input_tokens_seen": 2790000, + "step": 4280 + }, + { + "epoch": 2.5265330188679247, + "grad_norm": 1.8792797327041626, + "learning_rate": 9.97894923997915e-06, + "loss": 0.4323, + "num_input_tokens_seen": 2792688, + "step": 4285 + }, + { + "epoch": 2.5294811320754715, + "grad_norm": 2.144305467605591, + "learning_rate": 9.97871275155838e-06, + "loss": 0.5608, + "num_input_tokens_seen": 2796400, + "step": 4290 + }, + { + "epoch": 2.532429245283019, + "grad_norm": 1.8108692169189453, + "learning_rate": 9.978474945001949e-06, + "loss": 0.5732, + "num_input_tokens_seen": 2799280, + "step": 4295 + }, + { + "epoch": 2.535377358490566, + "grad_norm": 1.6497257947921753, + "learning_rate": 9.978235820372822e-06, + "loss": 0.3648, + "num_input_tokens_seen": 2802672, + "step": 4300 + }, + { + "epoch": 2.538325471698113, + "grad_norm": 1.7847671508789062, + "learning_rate": 9.977995377734307e-06, + "loss": 0.5498, + "num_input_tokens_seen": 2805360, + "step": 4305 + }, + { + "epoch": 2.5412735849056602, + "grad_norm": 3.381925106048584, + "learning_rate": 9.977753617150061e-06, + "loss": 0.3648, + "num_input_tokens_seen": 2809040, + "step": 4310 + }, + { + "epoch": 2.5442216981132075, + "grad_norm": 2.541572093963623, + "learning_rate": 9.977510538684094e-06, + "loss": 0.5923, + "num_input_tokens_seen": 2812080, + "step": 4315 + }, + { + "epoch": 2.547169811320755, + "grad_norm": 3.1444692611694336, + "learning_rate": 9.977266142400757e-06, + "loss": 0.5526, + "num_input_tokens_seen": 2816400, + "step": 4320 + }, + { + "epoch": 2.550117924528302, + "grad_norm": 3.7041258811950684, + "learning_rate": 9.977020428364759e-06, + "loss": 0.5813, + "num_input_tokens_seen": 2820144, + "step": 4325 + }, + { + "epoch": 2.553066037735849, + "grad_norm": 2.6744472980499268, + "learning_rate": 9.976773396641154e-06, + "loss": 0.4183, + "num_input_tokens_seen": 2823184, + "step": 4330 + }, + { + "epoch": 2.5560141509433962, + "grad_norm": 2.6345441341400146, + "learning_rate": 9.976525047295342e-06, + "loss": 0.3819, + "num_input_tokens_seen": 2826320, + "step": 4335 + }, + { + "epoch": 2.5589622641509435, + "grad_norm": 1.8418720960617065, + "learning_rate": 9.976275380393077e-06, + "loss": 0.4535, + "num_input_tokens_seen": 2830928, + "step": 4340 + }, + { + "epoch": 2.5619103773584904, + "grad_norm": 3.202420711517334, + "learning_rate": 9.976024396000459e-06, + "loss": 0.5383, + "num_input_tokens_seen": 2833968, + "step": 4345 + }, + { + "epoch": 2.5648584905660377, + "grad_norm": 1.9077577590942383, + "learning_rate": 9.975772094183935e-06, + "loss": 0.5048, + "num_input_tokens_seen": 2837392, + "step": 4350 + }, + { + "epoch": 2.567806603773585, + "grad_norm": 2.1330316066741943, + "learning_rate": 9.975518475010306e-06, + "loss": 0.5276, + "num_input_tokens_seen": 2840048, + "step": 4355 + }, + { + "epoch": 2.5707547169811322, + "grad_norm": 2.8477582931518555, + "learning_rate": 9.975263538546717e-06, + "loss": 0.4091, + "num_input_tokens_seen": 2842768, + "step": 4360 + }, + { + "epoch": 2.5737028301886795, + "grad_norm": 1.9988538026809692, + "learning_rate": 9.975007284860664e-06, + "loss": 0.531, + "num_input_tokens_seen": 2846512, + "step": 4365 + }, + { + "epoch": 2.5766509433962264, + "grad_norm": 1.7546887397766113, + "learning_rate": 9.974749714019993e-06, + "loss": 0.6445, + "num_input_tokens_seen": 2849808, + "step": 4370 + }, + { + "epoch": 2.5795990566037736, + "grad_norm": 5.490547180175781, + "learning_rate": 9.974490826092894e-06, + "loss": 0.4806, + "num_input_tokens_seen": 2853360, + "step": 4375 + }, + { + "epoch": 2.5825471698113205, + "grad_norm": 1.0776605606079102, + "learning_rate": 9.974230621147907e-06, + "loss": 0.4182, + "num_input_tokens_seen": 2857296, + "step": 4380 + }, + { + "epoch": 2.5854952830188678, + "grad_norm": 2.1588449478149414, + "learning_rate": 9.973969099253928e-06, + "loss": 0.5549, + "num_input_tokens_seen": 2861136, + "step": 4385 + }, + { + "epoch": 2.588443396226415, + "grad_norm": 2.1320228576660156, + "learning_rate": 9.973706260480194e-06, + "loss": 0.4759, + "num_input_tokens_seen": 2863888, + "step": 4390 + }, + { + "epoch": 2.5913915094339623, + "grad_norm": 4.187750339508057, + "learning_rate": 9.97344210489629e-06, + "loss": 0.5552, + "num_input_tokens_seen": 2867504, + "step": 4395 + }, + { + "epoch": 2.5943396226415096, + "grad_norm": 1.5328081846237183, + "learning_rate": 9.973176632572158e-06, + "loss": 0.47, + "num_input_tokens_seen": 2871056, + "step": 4400 + }, + { + "epoch": 2.5972877358490565, + "grad_norm": 1.2889806032180786, + "learning_rate": 9.972909843578076e-06, + "loss": 0.5567, + "num_input_tokens_seen": 2873712, + "step": 4405 + }, + { + "epoch": 2.6002358490566038, + "grad_norm": 1.62148916721344, + "learning_rate": 9.972641737984681e-06, + "loss": 0.6168, + "num_input_tokens_seen": 2876720, + "step": 4410 + }, + { + "epoch": 2.603183962264151, + "grad_norm": 1.920217752456665, + "learning_rate": 9.972372315862956e-06, + "loss": 0.478, + "num_input_tokens_seen": 2879664, + "step": 4415 + }, + { + "epoch": 2.606132075471698, + "grad_norm": 2.1170432567596436, + "learning_rate": 9.97210157728423e-06, + "loss": 0.4594, + "num_input_tokens_seen": 2882704, + "step": 4420 + }, + { + "epoch": 2.609080188679245, + "grad_norm": 6.475421905517578, + "learning_rate": 9.971829522320185e-06, + "loss": 0.3238, + "num_input_tokens_seen": 2890448, + "step": 4425 + }, + { + "epoch": 2.6120283018867925, + "grad_norm": 2.1703567504882812, + "learning_rate": 9.971556151042843e-06, + "loss": 0.4571, + "num_input_tokens_seen": 2893424, + "step": 4430 + }, + { + "epoch": 2.6149764150943398, + "grad_norm": 1.6484168767929077, + "learning_rate": 9.971281463524588e-06, + "loss": 0.5833, + "num_input_tokens_seen": 2899600, + "step": 4435 + }, + { + "epoch": 2.617924528301887, + "grad_norm": 1.962673544883728, + "learning_rate": 9.971005459838136e-06, + "loss": 0.4556, + "num_input_tokens_seen": 2903600, + "step": 4440 + }, + { + "epoch": 2.620872641509434, + "grad_norm": 2.691267490386963, + "learning_rate": 9.970728140056567e-06, + "loss": 0.5396, + "num_input_tokens_seen": 2907088, + "step": 4445 + }, + { + "epoch": 2.623820754716981, + "grad_norm": 1.9610198736190796, + "learning_rate": 9.9704495042533e-06, + "loss": 0.4049, + "num_input_tokens_seen": 2910672, + "step": 4450 + }, + { + "epoch": 2.6267688679245285, + "grad_norm": 2.563339948654175, + "learning_rate": 9.970169552502105e-06, + "loss": 0.5402, + "num_input_tokens_seen": 2913008, + "step": 4455 + }, + { + "epoch": 2.6297169811320753, + "grad_norm": 2.2070553302764893, + "learning_rate": 9.969888284877102e-06, + "loss": 0.5486, + "num_input_tokens_seen": 2916240, + "step": 4460 + }, + { + "epoch": 2.6326650943396226, + "grad_norm": 2.5004208087921143, + "learning_rate": 9.969605701452757e-06, + "loss": 0.3449, + "num_input_tokens_seen": 2919408, + "step": 4465 + }, + { + "epoch": 2.63561320754717, + "grad_norm": 3.3195559978485107, + "learning_rate": 9.969321802303882e-06, + "loss": 0.4772, + "num_input_tokens_seen": 2922224, + "step": 4470 + }, + { + "epoch": 2.638561320754717, + "grad_norm": 2.4619383811950684, + "learning_rate": 9.969036587505644e-06, + "loss": 0.4397, + "num_input_tokens_seen": 2925616, + "step": 4475 + }, + { + "epoch": 2.641509433962264, + "grad_norm": 5.624481678009033, + "learning_rate": 9.968750057133555e-06, + "loss": 0.3676, + "num_input_tokens_seen": 2928304, + "step": 4480 + }, + { + "epoch": 2.6444575471698113, + "grad_norm": 3.099343776702881, + "learning_rate": 9.968462211263474e-06, + "loss": 0.5841, + "num_input_tokens_seen": 2931440, + "step": 4485 + }, + { + "epoch": 2.6474056603773586, + "grad_norm": 1.4520219564437866, + "learning_rate": 9.96817304997161e-06, + "loss": 0.3536, + "num_input_tokens_seen": 2934416, + "step": 4490 + }, + { + "epoch": 2.6503537735849054, + "grad_norm": 6.3251776695251465, + "learning_rate": 9.967882573334519e-06, + "loss": 0.5362, + "num_input_tokens_seen": 2936912, + "step": 4495 + }, + { + "epoch": 2.6533018867924527, + "grad_norm": 1.255062460899353, + "learning_rate": 9.967590781429106e-06, + "loss": 0.4731, + "num_input_tokens_seen": 2940912, + "step": 4500 + }, + { + "epoch": 2.65625, + "grad_norm": 5.443777084350586, + "learning_rate": 9.967297674332625e-06, + "loss": 0.5464, + "num_input_tokens_seen": 2944336, + "step": 4505 + }, + { + "epoch": 2.6591981132075473, + "grad_norm": 1.4844576120376587, + "learning_rate": 9.967003252122675e-06, + "loss": 0.4632, + "num_input_tokens_seen": 2948208, + "step": 4510 + }, + { + "epoch": 2.6621462264150946, + "grad_norm": 2.8074753284454346, + "learning_rate": 9.96670751487721e-06, + "loss": 0.5545, + "num_input_tokens_seen": 2951088, + "step": 4515 + }, + { + "epoch": 2.6650943396226414, + "grad_norm": 2.77970290184021, + "learning_rate": 9.966410462674525e-06, + "loss": 0.4656, + "num_input_tokens_seen": 2954000, + "step": 4520 + }, + { + "epoch": 2.6680424528301887, + "grad_norm": 2.996859550476074, + "learning_rate": 9.966112095593264e-06, + "loss": 0.4189, + "num_input_tokens_seen": 2956624, + "step": 4525 + }, + { + "epoch": 2.670990566037736, + "grad_norm": 2.449484348297119, + "learning_rate": 9.965812413712425e-06, + "loss": 0.5658, + "num_input_tokens_seen": 2959568, + "step": 4530 + }, + { + "epoch": 2.673938679245283, + "grad_norm": 1.4806125164031982, + "learning_rate": 9.965511417111346e-06, + "loss": 0.5916, + "num_input_tokens_seen": 2962928, + "step": 4535 + }, + { + "epoch": 2.67688679245283, + "grad_norm": 4.137581825256348, + "learning_rate": 9.96520910586972e-06, + "loss": 0.509, + "num_input_tokens_seen": 2966352, + "step": 4540 + }, + { + "epoch": 2.6798349056603774, + "grad_norm": 0.9742993116378784, + "learning_rate": 9.964905480067585e-06, + "loss": 0.3752, + "num_input_tokens_seen": 2969584, + "step": 4545 + }, + { + "epoch": 2.6827830188679247, + "grad_norm": 3.860938310623169, + "learning_rate": 9.964600539785328e-06, + "loss": 0.3916, + "num_input_tokens_seen": 2972016, + "step": 4550 + }, + { + "epoch": 2.6857311320754715, + "grad_norm": 3.1197290420532227, + "learning_rate": 9.96429428510368e-06, + "loss": 0.6724, + "num_input_tokens_seen": 2974800, + "step": 4555 + }, + { + "epoch": 2.688679245283019, + "grad_norm": 2.4082212448120117, + "learning_rate": 9.963986716103724e-06, + "loss": 0.4719, + "num_input_tokens_seen": 2977136, + "step": 4560 + }, + { + "epoch": 2.691627358490566, + "grad_norm": 2.8378469944000244, + "learning_rate": 9.963677832866893e-06, + "loss": 0.4485, + "num_input_tokens_seen": 2979888, + "step": 4565 + }, + { + "epoch": 2.694575471698113, + "grad_norm": 2.55816388130188, + "learning_rate": 9.963367635474962e-06, + "loss": 0.572, + "num_input_tokens_seen": 2983248, + "step": 4570 + }, + { + "epoch": 2.6975235849056602, + "grad_norm": 2.482215642929077, + "learning_rate": 9.96305612401006e-06, + "loss": 0.5488, + "num_input_tokens_seen": 2986320, + "step": 4575 + }, + { + "epoch": 2.7004716981132075, + "grad_norm": 6.601240158081055, + "learning_rate": 9.96274329855466e-06, + "loss": 0.6084, + "num_input_tokens_seen": 2989392, + "step": 4580 + }, + { + "epoch": 2.703419811320755, + "grad_norm": 3.119270086288452, + "learning_rate": 9.962429159191583e-06, + "loss": 0.5299, + "num_input_tokens_seen": 2992400, + "step": 4585 + }, + { + "epoch": 2.706367924528302, + "grad_norm": 4.138728618621826, + "learning_rate": 9.962113706003997e-06, + "loss": 0.4753, + "num_input_tokens_seen": 2995056, + "step": 4590 + }, + { + "epoch": 2.709316037735849, + "grad_norm": 2.0090062618255615, + "learning_rate": 9.961796939075424e-06, + "loss": 0.5708, + "num_input_tokens_seen": 2998352, + "step": 4595 + }, + { + "epoch": 2.7122641509433962, + "grad_norm": 2.013165235519409, + "learning_rate": 9.961478858489728e-06, + "loss": 0.4354, + "num_input_tokens_seen": 3002256, + "step": 4600 + }, + { + "epoch": 2.7152122641509435, + "grad_norm": 1.9014936685562134, + "learning_rate": 9.961159464331119e-06, + "loss": 0.5572, + "num_input_tokens_seen": 3005008, + "step": 4605 + }, + { + "epoch": 2.7181603773584904, + "grad_norm": 2.1144134998321533, + "learning_rate": 9.960838756684161e-06, + "loss": 0.4895, + "num_input_tokens_seen": 3008272, + "step": 4610 + }, + { + "epoch": 2.7211084905660377, + "grad_norm": 1.9594963788986206, + "learning_rate": 9.960516735633764e-06, + "loss": 0.5084, + "num_input_tokens_seen": 3010864, + "step": 4615 + }, + { + "epoch": 2.724056603773585, + "grad_norm": 2.1598689556121826, + "learning_rate": 9.960193401265181e-06, + "loss": 0.5067, + "num_input_tokens_seen": 3013872, + "step": 4620 + }, + { + "epoch": 2.7270047169811322, + "grad_norm": 2.671921968460083, + "learning_rate": 9.959868753664018e-06, + "loss": 0.6774, + "num_input_tokens_seen": 3016912, + "step": 4625 + }, + { + "epoch": 2.7299528301886795, + "grad_norm": 1.183402419090271, + "learning_rate": 9.959542792916227e-06, + "loss": 0.5242, + "num_input_tokens_seen": 3020272, + "step": 4630 + }, + { + "epoch": 2.7329009433962264, + "grad_norm": 1.7900513410568237, + "learning_rate": 9.959215519108108e-06, + "loss": 0.3996, + "num_input_tokens_seen": 3024208, + "step": 4635 + }, + { + "epoch": 2.7358490566037736, + "grad_norm": 2.524648666381836, + "learning_rate": 9.958886932326306e-06, + "loss": 0.5381, + "num_input_tokens_seen": 3026960, + "step": 4640 + }, + { + "epoch": 2.7387971698113205, + "grad_norm": 2.692333698272705, + "learning_rate": 9.958557032657817e-06, + "loss": 0.6327, + "num_input_tokens_seen": 3029424, + "step": 4645 + }, + { + "epoch": 2.7417452830188678, + "grad_norm": 1.5379713773727417, + "learning_rate": 9.958225820189984e-06, + "loss": 0.5178, + "num_input_tokens_seen": 3033424, + "step": 4650 + }, + { + "epoch": 2.744693396226415, + "grad_norm": 1.6538691520690918, + "learning_rate": 9.957893295010495e-06, + "loss": 0.4539, + "num_input_tokens_seen": 3036112, + "step": 4655 + }, + { + "epoch": 2.7476415094339623, + "grad_norm": 1.4671686887741089, + "learning_rate": 9.957559457207391e-06, + "loss": 0.4823, + "num_input_tokens_seen": 3041872, + "step": 4660 + }, + { + "epoch": 2.7505896226415096, + "grad_norm": 1.641699194908142, + "learning_rate": 9.957224306869053e-06, + "loss": 0.5121, + "num_input_tokens_seen": 3044720, + "step": 4665 + }, + { + "epoch": 2.7535377358490565, + "grad_norm": 7.5341620445251465, + "learning_rate": 9.956887844084216e-06, + "loss": 0.4737, + "num_input_tokens_seen": 3047440, + "step": 4670 + }, + { + "epoch": 2.7564858490566038, + "grad_norm": 1.8347375392913818, + "learning_rate": 9.956550068941958e-06, + "loss": 0.4098, + "num_input_tokens_seen": 3050256, + "step": 4675 + }, + { + "epoch": 2.759433962264151, + "grad_norm": 2.062375545501709, + "learning_rate": 9.95621098153171e-06, + "loss": 0.4572, + "num_input_tokens_seen": 3053552, + "step": 4680 + }, + { + "epoch": 2.762382075471698, + "grad_norm": 1.9609460830688477, + "learning_rate": 9.955870581943243e-06, + "loss": 0.4695, + "num_input_tokens_seen": 3056304, + "step": 4685 + }, + { + "epoch": 2.765330188679245, + "grad_norm": 6.357059478759766, + "learning_rate": 9.955528870266681e-06, + "loss": 0.5201, + "num_input_tokens_seen": 3058960, + "step": 4690 + }, + { + "epoch": 2.7682783018867925, + "grad_norm": 2.26557993888855, + "learning_rate": 9.955185846592495e-06, + "loss": 0.324, + "num_input_tokens_seen": 3061616, + "step": 4695 + }, + { + "epoch": 2.7712264150943398, + "grad_norm": 1.3830561637878418, + "learning_rate": 9.9548415110115e-06, + "loss": 0.3864, + "num_input_tokens_seen": 3064880, + "step": 4700 + }, + { + "epoch": 2.774174528301887, + "grad_norm": 1.7953895330429077, + "learning_rate": 9.95449586361486e-06, + "loss": 0.4798, + "num_input_tokens_seen": 3067664, + "step": 4705 + }, + { + "epoch": 2.777122641509434, + "grad_norm": 3.8955676555633545, + "learning_rate": 9.954148904494085e-06, + "loss": 0.5824, + "num_input_tokens_seen": 3070064, + "step": 4710 + }, + { + "epoch": 2.780070754716981, + "grad_norm": 1.786041259765625, + "learning_rate": 9.95380063374104e-06, + "loss": 0.5195, + "num_input_tokens_seen": 3072848, + "step": 4715 + }, + { + "epoch": 2.7830188679245285, + "grad_norm": 1.5143654346466064, + "learning_rate": 9.953451051447927e-06, + "loss": 0.5824, + "num_input_tokens_seen": 3077168, + "step": 4720 + }, + { + "epoch": 2.7859669811320753, + "grad_norm": 1.824815273284912, + "learning_rate": 9.953100157707299e-06, + "loss": 0.5513, + "num_input_tokens_seen": 3080496, + "step": 4725 + }, + { + "epoch": 2.7889150943396226, + "grad_norm": 4.569231033325195, + "learning_rate": 9.952747952612056e-06, + "loss": 0.6045, + "num_input_tokens_seen": 3082608, + "step": 4730 + }, + { + "epoch": 2.79186320754717, + "grad_norm": 2.402606248855591, + "learning_rate": 9.952394436255451e-06, + "loss": 0.6115, + "num_input_tokens_seen": 3086128, + "step": 4735 + }, + { + "epoch": 2.794811320754717, + "grad_norm": 1.223108172416687, + "learning_rate": 9.952039608731072e-06, + "loss": 0.5221, + "num_input_tokens_seen": 3089424, + "step": 4740 + }, + { + "epoch": 2.797759433962264, + "grad_norm": 1.2881407737731934, + "learning_rate": 9.951683470132868e-06, + "loss": 0.4511, + "num_input_tokens_seen": 3092528, + "step": 4745 + }, + { + "epoch": 2.8007075471698113, + "grad_norm": 1.4854148626327515, + "learning_rate": 9.951326020555122e-06, + "loss": 0.5285, + "num_input_tokens_seen": 3095696, + "step": 4750 + }, + { + "epoch": 2.8036556603773586, + "grad_norm": 3.1883795261383057, + "learning_rate": 9.950967260092473e-06, + "loss": 0.5104, + "num_input_tokens_seen": 3098928, + "step": 4755 + }, + { + "epoch": 2.8066037735849054, + "grad_norm": 2.4417901039123535, + "learning_rate": 9.950607188839905e-06, + "loss": 0.538, + "num_input_tokens_seen": 3103024, + "step": 4760 + }, + { + "epoch": 2.8095518867924527, + "grad_norm": 1.0449641942977905, + "learning_rate": 9.950245806892749e-06, + "loss": 0.4575, + "num_input_tokens_seen": 3106480, + "step": 4765 + }, + { + "epoch": 2.8125, + "grad_norm": 1.4901535511016846, + "learning_rate": 9.94988311434668e-06, + "loss": 0.4051, + "num_input_tokens_seen": 3109840, + "step": 4770 + }, + { + "epoch": 2.8154481132075473, + "grad_norm": 2.4330759048461914, + "learning_rate": 9.949519111297723e-06, + "loss": 0.4717, + "num_input_tokens_seen": 3113072, + "step": 4775 + }, + { + "epoch": 2.8183962264150946, + "grad_norm": 1.079999327659607, + "learning_rate": 9.949153797842252e-06, + "loss": 0.4592, + "num_input_tokens_seen": 3116304, + "step": 4780 + }, + { + "epoch": 2.8213443396226414, + "grad_norm": 1.1438630819320679, + "learning_rate": 9.948787174076982e-06, + "loss": 0.4717, + "num_input_tokens_seen": 3119152, + "step": 4785 + }, + { + "epoch": 2.8242924528301887, + "grad_norm": 1.6952239274978638, + "learning_rate": 9.948419240098978e-06, + "loss": 0.4545, + "num_input_tokens_seen": 3122160, + "step": 4790 + }, + { + "epoch": 2.827240566037736, + "grad_norm": 1.4051077365875244, + "learning_rate": 9.948049996005657e-06, + "loss": 0.3353, + "num_input_tokens_seen": 3125296, + "step": 4795 + }, + { + "epoch": 2.830188679245283, + "grad_norm": 1.6469001770019531, + "learning_rate": 9.947679441894773e-06, + "loss": 0.4559, + "num_input_tokens_seen": 3128496, + "step": 4800 + }, + { + "epoch": 2.83313679245283, + "grad_norm": 3.1621716022491455, + "learning_rate": 9.947307577864433e-06, + "loss": 0.5985, + "num_input_tokens_seen": 3132272, + "step": 4805 + }, + { + "epoch": 2.8360849056603774, + "grad_norm": 2.071216344833374, + "learning_rate": 9.94693440401309e-06, + "loss": 0.5325, + "num_input_tokens_seen": 3135280, + "step": 4810 + }, + { + "epoch": 2.8390330188679247, + "grad_norm": 2.6229159832000732, + "learning_rate": 9.946559920439545e-06, + "loss": 0.4734, + "num_input_tokens_seen": 3138352, + "step": 4815 + }, + { + "epoch": 2.8419811320754715, + "grad_norm": 2.319192886352539, + "learning_rate": 9.946184127242942e-06, + "loss": 0.4258, + "num_input_tokens_seen": 3141520, + "step": 4820 + }, + { + "epoch": 2.844929245283019, + "grad_norm": 1.5826201438903809, + "learning_rate": 9.945807024522774e-06, + "loss": 0.3801, + "num_input_tokens_seen": 3143952, + "step": 4825 + }, + { + "epoch": 2.847877358490566, + "grad_norm": 1.9659805297851562, + "learning_rate": 9.945428612378881e-06, + "loss": 0.5233, + "num_input_tokens_seen": 3146736, + "step": 4830 + }, + { + "epoch": 2.850825471698113, + "grad_norm": 3.3256261348724365, + "learning_rate": 9.94504889091145e-06, + "loss": 0.6951, + "num_input_tokens_seen": 3148944, + "step": 4835 + }, + { + "epoch": 2.8537735849056602, + "grad_norm": 7.192554473876953, + "learning_rate": 9.944667860221013e-06, + "loss": 0.4822, + "num_input_tokens_seen": 3152432, + "step": 4840 + }, + { + "epoch": 2.8567216981132075, + "grad_norm": 2.0407843589782715, + "learning_rate": 9.944285520408448e-06, + "loss": 0.4719, + "num_input_tokens_seen": 3156688, + "step": 4845 + }, + { + "epoch": 2.859669811320755, + "grad_norm": 1.4957295656204224, + "learning_rate": 9.943901871574984e-06, + "loss": 0.4947, + "num_input_tokens_seen": 3159408, + "step": 4850 + }, + { + "epoch": 2.862617924528302, + "grad_norm": 3.536105155944824, + "learning_rate": 9.943516913822192e-06, + "loss": 0.5738, + "num_input_tokens_seen": 3163248, + "step": 4855 + }, + { + "epoch": 2.865566037735849, + "grad_norm": 1.4576832056045532, + "learning_rate": 9.943130647251994e-06, + "loss": 0.4385, + "num_input_tokens_seen": 3165584, + "step": 4860 + }, + { + "epoch": 2.8685141509433962, + "grad_norm": 2.1835362911224365, + "learning_rate": 9.94274307196665e-06, + "loss": 0.3143, + "num_input_tokens_seen": 3168656, + "step": 4865 + }, + { + "epoch": 2.8714622641509435, + "grad_norm": 3.3358004093170166, + "learning_rate": 9.942354188068778e-06, + "loss": 0.363, + "num_input_tokens_seen": 3172048, + "step": 4870 + }, + { + "epoch": 2.8744103773584904, + "grad_norm": 2.5637922286987305, + "learning_rate": 9.941963995661333e-06, + "loss": 0.6071, + "num_input_tokens_seen": 3174896, + "step": 4875 + }, + { + "epoch": 2.8773584905660377, + "grad_norm": 1.916135311126709, + "learning_rate": 9.941572494847622e-06, + "loss": 0.5665, + "num_input_tokens_seen": 3177488, + "step": 4880 + }, + { + "epoch": 2.880306603773585, + "grad_norm": 3.585164785385132, + "learning_rate": 9.941179685731297e-06, + "loss": 0.454, + "num_input_tokens_seen": 3179440, + "step": 4885 + }, + { + "epoch": 2.8832547169811322, + "grad_norm": 1.5545594692230225, + "learning_rate": 9.940785568416354e-06, + "loss": 0.4224, + "num_input_tokens_seen": 3182896, + "step": 4890 + }, + { + "epoch": 2.8862028301886795, + "grad_norm": 3.028679847717285, + "learning_rate": 9.940390143007137e-06, + "loss": 0.4948, + "num_input_tokens_seen": 3186320, + "step": 4895 + }, + { + "epoch": 2.8891509433962264, + "grad_norm": 1.2530959844589233, + "learning_rate": 9.939993409608339e-06, + "loss": 0.5518, + "num_input_tokens_seen": 3189200, + "step": 4900 + }, + { + "epoch": 2.8920990566037736, + "grad_norm": 1.295553207397461, + "learning_rate": 9.939595368324996e-06, + "loss": 0.474, + "num_input_tokens_seen": 3193104, + "step": 4905 + }, + { + "epoch": 2.8950471698113205, + "grad_norm": 1.7203960418701172, + "learning_rate": 9.93919601926249e-06, + "loss": 0.3859, + "num_input_tokens_seen": 3195920, + "step": 4910 + }, + { + "epoch": 2.8979952830188678, + "grad_norm": 1.2748231887817383, + "learning_rate": 9.938795362526552e-06, + "loss": 0.4464, + "num_input_tokens_seen": 3200848, + "step": 4915 + }, + { + "epoch": 2.900943396226415, + "grad_norm": 4.31773567199707, + "learning_rate": 9.938393398223255e-06, + "loss": 0.474, + "num_input_tokens_seen": 3203984, + "step": 4920 + }, + { + "epoch": 2.9038915094339623, + "grad_norm": 1.6798877716064453, + "learning_rate": 9.937990126459024e-06, + "loss": 0.4361, + "num_input_tokens_seen": 3207824, + "step": 4925 + }, + { + "epoch": 2.9068396226415096, + "grad_norm": 2.577207326889038, + "learning_rate": 9.937585547340624e-06, + "loss": 0.5225, + "num_input_tokens_seen": 3210992, + "step": 4930 + }, + { + "epoch": 2.9097877358490565, + "grad_norm": 1.8575819730758667, + "learning_rate": 9.937179660975174e-06, + "loss": 0.3623, + "num_input_tokens_seen": 3214544, + "step": 4935 + }, + { + "epoch": 2.9127358490566038, + "grad_norm": 2.627897024154663, + "learning_rate": 9.936772467470127e-06, + "loss": 0.4696, + "num_input_tokens_seen": 3217424, + "step": 4940 + }, + { + "epoch": 2.915683962264151, + "grad_norm": 1.8516827821731567, + "learning_rate": 9.936363966933294e-06, + "loss": 0.6843, + "num_input_tokens_seen": 3220400, + "step": 4945 + }, + { + "epoch": 2.918632075471698, + "grad_norm": 1.6161060333251953, + "learning_rate": 9.935954159472828e-06, + "loss": 0.375, + "num_input_tokens_seen": 3223408, + "step": 4950 + }, + { + "epoch": 2.921580188679245, + "grad_norm": 1.6432689428329468, + "learning_rate": 9.935543045197222e-06, + "loss": 0.418, + "num_input_tokens_seen": 3226288, + "step": 4955 + }, + { + "epoch": 2.9245283018867925, + "grad_norm": 2.416006565093994, + "learning_rate": 9.935130624215326e-06, + "loss": 0.5549, + "num_input_tokens_seen": 3229296, + "step": 4960 + }, + { + "epoch": 2.9274764150943398, + "grad_norm": 1.7391762733459473, + "learning_rate": 9.934716896636329e-06, + "loss": 0.5579, + "num_input_tokens_seen": 3232048, + "step": 4965 + }, + { + "epoch": 2.930424528301887, + "grad_norm": 3.599201202392578, + "learning_rate": 9.934301862569764e-06, + "loss": 0.6727, + "num_input_tokens_seen": 3234832, + "step": 4970 + }, + { + "epoch": 2.933372641509434, + "grad_norm": 8.448949813842773, + "learning_rate": 9.933885522125517e-06, + "loss": 0.6517, + "num_input_tokens_seen": 3237648, + "step": 4975 + }, + { + "epoch": 2.936320754716981, + "grad_norm": 1.8905072212219238, + "learning_rate": 9.933467875413813e-06, + "loss": 0.6168, + "num_input_tokens_seen": 3240944, + "step": 4980 + }, + { + "epoch": 2.9392688679245285, + "grad_norm": 0.8193274140357971, + "learning_rate": 9.933048922545227e-06, + "loss": 0.5366, + "num_input_tokens_seen": 3244016, + "step": 4985 + }, + { + "epoch": 2.9422169811320753, + "grad_norm": 1.5942493677139282, + "learning_rate": 9.932628663630679e-06, + "loss": 0.4305, + "num_input_tokens_seen": 3246576, + "step": 4990 + }, + { + "epoch": 2.9451650943396226, + "grad_norm": 3.5463485717773438, + "learning_rate": 9.932207098781432e-06, + "loss": 0.6666, + "num_input_tokens_seen": 3250160, + "step": 4995 + }, + { + "epoch": 2.94811320754717, + "grad_norm": 1.8341648578643799, + "learning_rate": 9.931784228109102e-06, + "loss": 0.5147, + "num_input_tokens_seen": 3253296, + "step": 5000 + }, + { + "epoch": 2.951061320754717, + "grad_norm": 2.0751190185546875, + "learning_rate": 9.93136005172564e-06, + "loss": 0.459, + "num_input_tokens_seen": 3256496, + "step": 5005 + }, + { + "epoch": 2.954009433962264, + "grad_norm": 1.4762810468673706, + "learning_rate": 9.930934569743354e-06, + "loss": 0.4601, + "num_input_tokens_seen": 3259280, + "step": 5010 + }, + { + "epoch": 2.9569575471698113, + "grad_norm": 1.3667514324188232, + "learning_rate": 9.930507782274888e-06, + "loss": 0.5588, + "num_input_tokens_seen": 3264144, + "step": 5015 + }, + { + "epoch": 2.9599056603773586, + "grad_norm": 2.6102676391601562, + "learning_rate": 9.930079689433236e-06, + "loss": 0.427, + "num_input_tokens_seen": 3267440, + "step": 5020 + }, + { + "epoch": 2.9628537735849054, + "grad_norm": 1.4854927062988281, + "learning_rate": 9.92965029133174e-06, + "loss": 0.5439, + "num_input_tokens_seen": 3271760, + "step": 5025 + }, + { + "epoch": 2.9658018867924527, + "grad_norm": 2.2654364109039307, + "learning_rate": 9.929219588084084e-06, + "loss": 0.4858, + "num_input_tokens_seen": 3275216, + "step": 5030 + }, + { + "epoch": 2.96875, + "grad_norm": 1.452226161956787, + "learning_rate": 9.9287875798043e-06, + "loss": 0.4602, + "num_input_tokens_seen": 3278992, + "step": 5035 + }, + { + "epoch": 2.9716981132075473, + "grad_norm": 4.526086807250977, + "learning_rate": 9.92835426660676e-06, + "loss": 0.5581, + "num_input_tokens_seen": 3281776, + "step": 5040 + }, + { + "epoch": 2.9746462264150946, + "grad_norm": 1.8233877420425415, + "learning_rate": 9.927919648606188e-06, + "loss": 0.4128, + "num_input_tokens_seen": 3284304, + "step": 5045 + }, + { + "epoch": 2.9775943396226414, + "grad_norm": 1.2043710947036743, + "learning_rate": 9.927483725917652e-06, + "loss": 0.3332, + "num_input_tokens_seen": 3286896, + "step": 5050 + }, + { + "epoch": 2.9805424528301887, + "grad_norm": 6.421375751495361, + "learning_rate": 9.927046498656562e-06, + "loss": 0.5722, + "num_input_tokens_seen": 3290128, + "step": 5055 + }, + { + "epoch": 2.983490566037736, + "grad_norm": 2.4058752059936523, + "learning_rate": 9.926607966938679e-06, + "loss": 0.597, + "num_input_tokens_seen": 3293520, + "step": 5060 + }, + { + "epoch": 2.986438679245283, + "grad_norm": 1.6264110803604126, + "learning_rate": 9.926168130880103e-06, + "loss": 0.4408, + "num_input_tokens_seen": 3296624, + "step": 5065 + }, + { + "epoch": 2.98938679245283, + "grad_norm": 2.234410524368286, + "learning_rate": 9.925726990597283e-06, + "loss": 0.5116, + "num_input_tokens_seen": 3300080, + "step": 5070 + }, + { + "epoch": 2.9923349056603774, + "grad_norm": 2.243040084838867, + "learning_rate": 9.925284546207015e-06, + "loss": 0.5711, + "num_input_tokens_seen": 3303280, + "step": 5075 + }, + { + "epoch": 2.9952830188679247, + "grad_norm": 4.0567731857299805, + "learning_rate": 9.924840797826436e-06, + "loss": 0.524, + "num_input_tokens_seen": 3306128, + "step": 5080 + }, + { + "epoch": 2.9982311320754715, + "grad_norm": 1.8744608163833618, + "learning_rate": 9.924395745573029e-06, + "loss": 0.644, + "num_input_tokens_seen": 3309264, + "step": 5085 + }, + { + "epoch": 3.001179245283019, + "grad_norm": 1.2985773086547852, + "learning_rate": 9.923949389564629e-06, + "loss": 0.572, + "num_input_tokens_seen": 3312280, + "step": 5090 + }, + { + "epoch": 3.004127358490566, + "grad_norm": 2.911729574203491, + "learning_rate": 9.923501729919404e-06, + "loss": 0.5164, + "num_input_tokens_seen": 3314968, + "step": 5095 + }, + { + "epoch": 3.0070754716981134, + "grad_norm": 1.4047752618789673, + "learning_rate": 9.923052766755878e-06, + "loss": 0.4092, + "num_input_tokens_seen": 3318392, + "step": 5100 + }, + { + "epoch": 3.0100235849056602, + "grad_norm": 1.6989785432815552, + "learning_rate": 9.922602500192914e-06, + "loss": 0.5324, + "num_input_tokens_seen": 3321144, + "step": 5105 + }, + { + "epoch": 3.0129716981132075, + "grad_norm": 1.2788058519363403, + "learning_rate": 9.922150930349725e-06, + "loss": 0.4799, + "num_input_tokens_seen": 3323672, + "step": 5110 + }, + { + "epoch": 3.015919811320755, + "grad_norm": 1.490939974784851, + "learning_rate": 9.921698057345863e-06, + "loss": 0.3751, + "num_input_tokens_seen": 3327064, + "step": 5115 + }, + { + "epoch": 3.018867924528302, + "grad_norm": 1.3321319818496704, + "learning_rate": 9.921243881301229e-06, + "loss": 0.499, + "num_input_tokens_seen": 3330840, + "step": 5120 + }, + { + "epoch": 3.021816037735849, + "grad_norm": 2.367957353591919, + "learning_rate": 9.920788402336068e-06, + "loss": 0.3978, + "num_input_tokens_seen": 3334616, + "step": 5125 + }, + { + "epoch": 3.0247641509433962, + "grad_norm": 1.01689612865448, + "learning_rate": 9.92033162057097e-06, + "loss": 0.4086, + "num_input_tokens_seen": 3337944, + "step": 5130 + }, + { + "epoch": 3.0277122641509435, + "grad_norm": 1.487201452255249, + "learning_rate": 9.919873536126869e-06, + "loss": 0.4396, + "num_input_tokens_seen": 3341272, + "step": 5135 + }, + { + "epoch": 3.0306603773584904, + "grad_norm": 1.277281641960144, + "learning_rate": 9.919414149125046e-06, + "loss": 0.4916, + "num_input_tokens_seen": 3344344, + "step": 5140 + }, + { + "epoch": 3.0336084905660377, + "grad_norm": 1.537855863571167, + "learning_rate": 9.918953459687126e-06, + "loss": 0.4229, + "num_input_tokens_seen": 3347192, + "step": 5145 + }, + { + "epoch": 3.036556603773585, + "grad_norm": 1.5782262086868286, + "learning_rate": 9.918491467935078e-06, + "loss": 0.4529, + "num_input_tokens_seen": 3351224, + "step": 5150 + }, + { + "epoch": 3.0395047169811322, + "grad_norm": 2.3634634017944336, + "learning_rate": 9.918028173991218e-06, + "loss": 0.4174, + "num_input_tokens_seen": 3354072, + "step": 5155 + }, + { + "epoch": 3.042452830188679, + "grad_norm": 1.8541532754898071, + "learning_rate": 9.917563577978202e-06, + "loss": 0.4613, + "num_input_tokens_seen": 3356856, + "step": 5160 + }, + { + "epoch": 3.0454009433962264, + "grad_norm": 0.9669644832611084, + "learning_rate": 9.917097680019035e-06, + "loss": 0.4816, + "num_input_tokens_seen": 3360280, + "step": 5165 + }, + { + "epoch": 3.0483490566037736, + "grad_norm": 2.462339401245117, + "learning_rate": 9.916630480237066e-06, + "loss": 0.4723, + "num_input_tokens_seen": 3362968, + "step": 5170 + }, + { + "epoch": 3.051297169811321, + "grad_norm": 1.8991142511367798, + "learning_rate": 9.916161978755988e-06, + "loss": 0.4736, + "num_input_tokens_seen": 3366648, + "step": 5175 + }, + { + "epoch": 3.0542452830188678, + "grad_norm": 2.372828722000122, + "learning_rate": 9.915692175699838e-06, + "loss": 0.4842, + "num_input_tokens_seen": 3369304, + "step": 5180 + }, + { + "epoch": 3.057193396226415, + "grad_norm": 2.6835107803344727, + "learning_rate": 9.915221071193e-06, + "loss": 0.5507, + "num_input_tokens_seen": 3372504, + "step": 5185 + }, + { + "epoch": 3.0601415094339623, + "grad_norm": 2.030510663986206, + "learning_rate": 9.914748665360199e-06, + "loss": 0.4882, + "num_input_tokens_seen": 3375096, + "step": 5190 + }, + { + "epoch": 3.0630896226415096, + "grad_norm": 10.450382232666016, + "learning_rate": 9.914274958326507e-06, + "loss": 0.6181, + "num_input_tokens_seen": 3377592, + "step": 5195 + }, + { + "epoch": 3.0660377358490565, + "grad_norm": 1.533773422241211, + "learning_rate": 9.913799950217341e-06, + "loss": 0.5641, + "num_input_tokens_seen": 3380728, + "step": 5200 + }, + { + "epoch": 3.0689858490566038, + "grad_norm": 3.3019399642944336, + "learning_rate": 9.91332364115846e-06, + "loss": 0.4957, + "num_input_tokens_seen": 3383416, + "step": 5205 + }, + { + "epoch": 3.071933962264151, + "grad_norm": 1.7606898546218872, + "learning_rate": 9.912846031275972e-06, + "loss": 0.5678, + "num_input_tokens_seen": 3386904, + "step": 5210 + }, + { + "epoch": 3.074882075471698, + "grad_norm": 2.5911142826080322, + "learning_rate": 9.912367120696322e-06, + "loss": 0.403, + "num_input_tokens_seen": 3389496, + "step": 5215 + }, + { + "epoch": 3.077830188679245, + "grad_norm": 2.262333631515503, + "learning_rate": 9.911886909546307e-06, + "loss": 0.4631, + "num_input_tokens_seen": 3392856, + "step": 5220 + }, + { + "epoch": 3.0807783018867925, + "grad_norm": 1.7548032999038696, + "learning_rate": 9.911405397953063e-06, + "loss": 0.4369, + "num_input_tokens_seen": 3396280, + "step": 5225 + }, + { + "epoch": 3.0837264150943398, + "grad_norm": 1.9484949111938477, + "learning_rate": 9.910922586044073e-06, + "loss": 0.4041, + "num_input_tokens_seen": 3400216, + "step": 5230 + }, + { + "epoch": 3.0866745283018866, + "grad_norm": 4.740269660949707, + "learning_rate": 9.910438473947163e-06, + "loss": 0.4477, + "num_input_tokens_seen": 3402744, + "step": 5235 + }, + { + "epoch": 3.089622641509434, + "grad_norm": 1.7629338502883911, + "learning_rate": 9.909953061790506e-06, + "loss": 0.6051, + "num_input_tokens_seen": 3405816, + "step": 5240 + }, + { + "epoch": 3.092570754716981, + "grad_norm": 2.18304705619812, + "learning_rate": 9.909466349702613e-06, + "loss": 0.5483, + "num_input_tokens_seen": 3408824, + "step": 5245 + }, + { + "epoch": 3.0955188679245285, + "grad_norm": 2.186913013458252, + "learning_rate": 9.908978337812348e-06, + "loss": 0.4303, + "num_input_tokens_seen": 3412504, + "step": 5250 + }, + { + "epoch": 3.0984669811320753, + "grad_norm": 1.807883620262146, + "learning_rate": 9.908489026248909e-06, + "loss": 0.6035, + "num_input_tokens_seen": 3415192, + "step": 5255 + }, + { + "epoch": 3.1014150943396226, + "grad_norm": 2.9743282794952393, + "learning_rate": 9.907998415141846e-06, + "loss": 0.4859, + "num_input_tokens_seen": 3418456, + "step": 5260 + }, + { + "epoch": 3.10436320754717, + "grad_norm": 1.8896187543869019, + "learning_rate": 9.907506504621052e-06, + "loss": 0.4454, + "num_input_tokens_seen": 3421560, + "step": 5265 + }, + { + "epoch": 3.107311320754717, + "grad_norm": 6.483303546905518, + "learning_rate": 9.907013294816759e-06, + "loss": 0.5021, + "num_input_tokens_seen": 3425272, + "step": 5270 + }, + { + "epoch": 3.110259433962264, + "grad_norm": 1.8346004486083984, + "learning_rate": 9.906518785859548e-06, + "loss": 0.4863, + "num_input_tokens_seen": 3428216, + "step": 5275 + }, + { + "epoch": 3.1132075471698113, + "grad_norm": 1.776559591293335, + "learning_rate": 9.906022977880344e-06, + "loss": 0.3763, + "num_input_tokens_seen": 3431032, + "step": 5280 + }, + { + "epoch": 3.1161556603773586, + "grad_norm": 1.3047335147857666, + "learning_rate": 9.905525871010412e-06, + "loss": 0.3967, + "num_input_tokens_seen": 3434104, + "step": 5285 + }, + { + "epoch": 3.119103773584906, + "grad_norm": 2.0682482719421387, + "learning_rate": 9.905027465381363e-06, + "loss": 0.4336, + "num_input_tokens_seen": 3436824, + "step": 5290 + }, + { + "epoch": 3.1220518867924527, + "grad_norm": 4.566401481628418, + "learning_rate": 9.904527761125155e-06, + "loss": 0.6082, + "num_input_tokens_seen": 3439352, + "step": 5295 + }, + { + "epoch": 3.125, + "grad_norm": 1.373267650604248, + "learning_rate": 9.904026758374083e-06, + "loss": 0.381, + "num_input_tokens_seen": 3442680, + "step": 5300 + }, + { + "epoch": 3.1279481132075473, + "grad_norm": 3.013223648071289, + "learning_rate": 9.903524457260794e-06, + "loss": 0.4136, + "num_input_tokens_seen": 3445944, + "step": 5305 + }, + { + "epoch": 3.1308962264150946, + "grad_norm": 1.5326875448226929, + "learning_rate": 9.90302085791827e-06, + "loss": 0.4896, + "num_input_tokens_seen": 3449080, + "step": 5310 + }, + { + "epoch": 3.1338443396226414, + "grad_norm": 1.798431158065796, + "learning_rate": 9.902515960479844e-06, + "loss": 0.5089, + "num_input_tokens_seen": 3453208, + "step": 5315 + }, + { + "epoch": 3.1367924528301887, + "grad_norm": 2.233085870742798, + "learning_rate": 9.902009765079188e-06, + "loss": 0.5379, + "num_input_tokens_seen": 3456856, + "step": 5320 + }, + { + "epoch": 3.139740566037736, + "grad_norm": 2.2206714153289795, + "learning_rate": 9.90150227185032e-06, + "loss": 0.462, + "num_input_tokens_seen": 3459672, + "step": 5325 + }, + { + "epoch": 3.142688679245283, + "grad_norm": 2.3948395252227783, + "learning_rate": 9.900993480927603e-06, + "loss": 0.5942, + "num_input_tokens_seen": 3462040, + "step": 5330 + }, + { + "epoch": 3.14563679245283, + "grad_norm": 2.5827414989471436, + "learning_rate": 9.90048339244574e-06, + "loss": 0.5012, + "num_input_tokens_seen": 3464856, + "step": 5335 + }, + { + "epoch": 3.1485849056603774, + "grad_norm": 2.1674084663391113, + "learning_rate": 9.899972006539776e-06, + "loss": 0.4121, + "num_input_tokens_seen": 3467928, + "step": 5340 + }, + { + "epoch": 3.1515330188679247, + "grad_norm": 2.314133882522583, + "learning_rate": 9.899459323345106e-06, + "loss": 0.5219, + "num_input_tokens_seen": 3471096, + "step": 5345 + }, + { + "epoch": 3.1544811320754715, + "grad_norm": 2.3987629413604736, + "learning_rate": 9.898945342997467e-06, + "loss": 0.4654, + "num_input_tokens_seen": 3474808, + "step": 5350 + }, + { + "epoch": 3.157429245283019, + "grad_norm": 2.3828718662261963, + "learning_rate": 9.898430065632933e-06, + "loss": 0.3936, + "num_input_tokens_seen": 3478296, + "step": 5355 + }, + { + "epoch": 3.160377358490566, + "grad_norm": 1.3809982538223267, + "learning_rate": 9.897913491387929e-06, + "loss": 0.4716, + "num_input_tokens_seen": 3481304, + "step": 5360 + }, + { + "epoch": 3.1633254716981134, + "grad_norm": 1.3537770509719849, + "learning_rate": 9.897395620399219e-06, + "loss": 0.5651, + "num_input_tokens_seen": 3486200, + "step": 5365 + }, + { + "epoch": 3.1662735849056602, + "grad_norm": 2.0660784244537354, + "learning_rate": 9.896876452803913e-06, + "loss": 0.5685, + "num_input_tokens_seen": 3489784, + "step": 5370 + }, + { + "epoch": 3.1692216981132075, + "grad_norm": 1.3299871683120728, + "learning_rate": 9.896355988739461e-06, + "loss": 0.4432, + "num_input_tokens_seen": 3492824, + "step": 5375 + }, + { + "epoch": 3.172169811320755, + "grad_norm": 1.5161328315734863, + "learning_rate": 9.895834228343658e-06, + "loss": 0.5175, + "num_input_tokens_seen": 3495736, + "step": 5380 + }, + { + "epoch": 3.175117924528302, + "grad_norm": 2.922492027282715, + "learning_rate": 9.895311171754644e-06, + "loss": 0.5193, + "num_input_tokens_seen": 3498616, + "step": 5385 + }, + { + "epoch": 3.178066037735849, + "grad_norm": 2.018202066421509, + "learning_rate": 9.8947868191109e-06, + "loss": 0.5631, + "num_input_tokens_seen": 3501976, + "step": 5390 + }, + { + "epoch": 3.1810141509433962, + "grad_norm": 3.446112632751465, + "learning_rate": 9.894261170551249e-06, + "loss": 0.4348, + "num_input_tokens_seen": 3507096, + "step": 5395 + }, + { + "epoch": 3.1839622641509435, + "grad_norm": 1.0192890167236328, + "learning_rate": 9.893734226214861e-06, + "loss": 0.4042, + "num_input_tokens_seen": 3510776, + "step": 5400 + }, + { + "epoch": 3.1869103773584904, + "grad_norm": 8.99704647064209, + "learning_rate": 9.893205986241246e-06, + "loss": 0.4858, + "num_input_tokens_seen": 3514712, + "step": 5405 + }, + { + "epoch": 3.1898584905660377, + "grad_norm": 0.9235435724258423, + "learning_rate": 9.892676450770257e-06, + "loss": 0.3711, + "num_input_tokens_seen": 3517688, + "step": 5410 + }, + { + "epoch": 3.192806603773585, + "grad_norm": 1.7647525072097778, + "learning_rate": 9.892145619942092e-06, + "loss": 0.3671, + "num_input_tokens_seen": 3520760, + "step": 5415 + }, + { + "epoch": 3.1957547169811322, + "grad_norm": 2.5886619091033936, + "learning_rate": 9.891613493897289e-06, + "loss": 0.4361, + "num_input_tokens_seen": 3523672, + "step": 5420 + }, + { + "epoch": 3.198702830188679, + "grad_norm": 2.938159704208374, + "learning_rate": 9.891080072776733e-06, + "loss": 0.6848, + "num_input_tokens_seen": 3526552, + "step": 5425 + }, + { + "epoch": 3.2016509433962264, + "grad_norm": 1.8357021808624268, + "learning_rate": 9.890545356721649e-06, + "loss": 0.5061, + "num_input_tokens_seen": 3530552, + "step": 5430 + }, + { + "epoch": 3.2045990566037736, + "grad_norm": 3.6945109367370605, + "learning_rate": 9.890009345873603e-06, + "loss": 0.4507, + "num_input_tokens_seen": 3533656, + "step": 5435 + }, + { + "epoch": 3.207547169811321, + "grad_norm": 1.415722131729126, + "learning_rate": 9.889472040374509e-06, + "loss": 0.3722, + "num_input_tokens_seen": 3536632, + "step": 5440 + }, + { + "epoch": 3.2104952830188678, + "grad_norm": 2.3702099323272705, + "learning_rate": 9.88893344036662e-06, + "loss": 0.4766, + "num_input_tokens_seen": 3539256, + "step": 5445 + }, + { + "epoch": 3.213443396226415, + "grad_norm": 4.88722038269043, + "learning_rate": 9.888393545992531e-06, + "loss": 0.5999, + "num_input_tokens_seen": 3541624, + "step": 5450 + }, + { + "epoch": 3.2163915094339623, + "grad_norm": 0.7700355648994446, + "learning_rate": 9.887852357395184e-06, + "loss": 0.3474, + "num_input_tokens_seen": 3545272, + "step": 5455 + }, + { + "epoch": 3.2193396226415096, + "grad_norm": 2.5676627159118652, + "learning_rate": 9.88730987471786e-06, + "loss": 0.4934, + "num_input_tokens_seen": 3548760, + "step": 5460 + }, + { + "epoch": 3.2222877358490565, + "grad_norm": 1.616018295288086, + "learning_rate": 9.886766098104183e-06, + "loss": 0.5045, + "num_input_tokens_seen": 3553208, + "step": 5465 + }, + { + "epoch": 3.2252358490566038, + "grad_norm": 1.3842121362686157, + "learning_rate": 9.886221027698122e-06, + "loss": 0.4052, + "num_input_tokens_seen": 3555800, + "step": 5470 + }, + { + "epoch": 3.228183962264151, + "grad_norm": 1.9846680164337158, + "learning_rate": 9.885674663643983e-06, + "loss": 0.6302, + "num_input_tokens_seen": 3558616, + "step": 5475 + }, + { + "epoch": 3.231132075471698, + "grad_norm": 1.8711097240447998, + "learning_rate": 9.885127006086423e-06, + "loss": 0.734, + "num_input_tokens_seen": 3562264, + "step": 5480 + }, + { + "epoch": 3.234080188679245, + "grad_norm": 2.813077449798584, + "learning_rate": 9.884578055170434e-06, + "loss": 0.5638, + "num_input_tokens_seen": 3565912, + "step": 5485 + }, + { + "epoch": 3.2370283018867925, + "grad_norm": 1.1934574842453003, + "learning_rate": 9.884027811041353e-06, + "loss": 0.4199, + "num_input_tokens_seen": 3570200, + "step": 5490 + }, + { + "epoch": 3.2399764150943398, + "grad_norm": 2.346193313598633, + "learning_rate": 9.883476273844861e-06, + "loss": 0.5261, + "num_input_tokens_seen": 3572952, + "step": 5495 + }, + { + "epoch": 3.2429245283018866, + "grad_norm": 1.7948797941207886, + "learning_rate": 9.882923443726977e-06, + "loss": 0.3781, + "num_input_tokens_seen": 3575800, + "step": 5500 + }, + { + "epoch": 3.245872641509434, + "grad_norm": 1.5508087873458862, + "learning_rate": 9.882369320834068e-06, + "loss": 0.4212, + "num_input_tokens_seen": 3579256, + "step": 5505 + }, + { + "epoch": 3.248820754716981, + "grad_norm": 5.076117515563965, + "learning_rate": 9.88181390531284e-06, + "loss": 0.4401, + "num_input_tokens_seen": 3583160, + "step": 5510 + }, + { + "epoch": 3.2517688679245285, + "grad_norm": 1.8209553956985474, + "learning_rate": 9.88125719731034e-06, + "loss": 0.4397, + "num_input_tokens_seen": 3587448, + "step": 5515 + }, + { + "epoch": 3.2547169811320753, + "grad_norm": 1.63091242313385, + "learning_rate": 9.880699196973962e-06, + "loss": 0.4377, + "num_input_tokens_seen": 3591160, + "step": 5520 + }, + { + "epoch": 3.2576650943396226, + "grad_norm": 1.5736274719238281, + "learning_rate": 9.880139904451436e-06, + "loss": 0.4794, + "num_input_tokens_seen": 3593880, + "step": 5525 + }, + { + "epoch": 3.26061320754717, + "grad_norm": 1.3822582960128784, + "learning_rate": 9.879579319890838e-06, + "loss": 0.4034, + "num_input_tokens_seen": 3596792, + "step": 5530 + }, + { + "epoch": 3.263561320754717, + "grad_norm": 2.072382688522339, + "learning_rate": 9.879017443440584e-06, + "loss": 0.4078, + "num_input_tokens_seen": 3599928, + "step": 5535 + }, + { + "epoch": 3.266509433962264, + "grad_norm": 0.9391538500785828, + "learning_rate": 9.878454275249436e-06, + "loss": 0.4507, + "num_input_tokens_seen": 3602872, + "step": 5540 + }, + { + "epoch": 3.2694575471698113, + "grad_norm": 4.76472282409668, + "learning_rate": 9.877889815466493e-06, + "loss": 0.4563, + "num_input_tokens_seen": 3606328, + "step": 5545 + }, + { + "epoch": 3.2724056603773586, + "grad_norm": 1.3194336891174316, + "learning_rate": 9.877324064241198e-06, + "loss": 0.4057, + "num_input_tokens_seen": 3609400, + "step": 5550 + }, + { + "epoch": 3.2753537735849054, + "grad_norm": 3.9348933696746826, + "learning_rate": 9.876757021723338e-06, + "loss": 0.5476, + "num_input_tokens_seen": 3612216, + "step": 5555 + }, + { + "epoch": 3.2783018867924527, + "grad_norm": 1.4248347282409668, + "learning_rate": 9.876188688063038e-06, + "loss": 0.3909, + "num_input_tokens_seen": 3616408, + "step": 5560 + }, + { + "epoch": 3.28125, + "grad_norm": 3.1484227180480957, + "learning_rate": 9.875619063410768e-06, + "loss": 0.4765, + "num_input_tokens_seen": 3620216, + "step": 5565 + }, + { + "epoch": 3.2841981132075473, + "grad_norm": 1.2532635927200317, + "learning_rate": 9.875048147917339e-06, + "loss": 0.6603, + "num_input_tokens_seen": 3623384, + "step": 5570 + }, + { + "epoch": 3.2871462264150946, + "grad_norm": 1.9907788038253784, + "learning_rate": 9.874475941733902e-06, + "loss": 0.3378, + "num_input_tokens_seen": 3626712, + "step": 5575 + }, + { + "epoch": 3.2900943396226414, + "grad_norm": 1.4493025541305542, + "learning_rate": 9.873902445011952e-06, + "loss": 0.4546, + "num_input_tokens_seen": 3629752, + "step": 5580 + }, + { + "epoch": 3.2930424528301887, + "grad_norm": 2.23189640045166, + "learning_rate": 9.873327657903324e-06, + "loss": 0.5013, + "num_input_tokens_seen": 3632536, + "step": 5585 + }, + { + "epoch": 3.295990566037736, + "grad_norm": 2.6468918323516846, + "learning_rate": 9.872751580560194e-06, + "loss": 0.3636, + "num_input_tokens_seen": 3635768, + "step": 5590 + }, + { + "epoch": 3.298938679245283, + "grad_norm": 1.497018575668335, + "learning_rate": 9.872174213135084e-06, + "loss": 0.3771, + "num_input_tokens_seen": 3638424, + "step": 5595 + }, + { + "epoch": 3.30188679245283, + "grad_norm": 1.6334091424942017, + "learning_rate": 9.871595555780855e-06, + "loss": 0.4962, + "num_input_tokens_seen": 3641912, + "step": 5600 + }, + { + "epoch": 3.3048349056603774, + "grad_norm": 5.743154525756836, + "learning_rate": 9.871015608650705e-06, + "loss": 0.4433, + "num_input_tokens_seen": 3644856, + "step": 5605 + }, + { + "epoch": 3.3077830188679247, + "grad_norm": 0.9836852550506592, + "learning_rate": 9.870434371898182e-06, + "loss": 0.5282, + "num_input_tokens_seen": 3647960, + "step": 5610 + }, + { + "epoch": 3.3107311320754715, + "grad_norm": 1.7344549894332886, + "learning_rate": 9.869851845677165e-06, + "loss": 0.4544, + "num_input_tokens_seen": 3651416, + "step": 5615 + }, + { + "epoch": 3.313679245283019, + "grad_norm": 2.3151936531066895, + "learning_rate": 9.869268030141886e-06, + "loss": 0.4013, + "num_input_tokens_seen": 3655192, + "step": 5620 + }, + { + "epoch": 3.316627358490566, + "grad_norm": 3.1920158863067627, + "learning_rate": 9.86868292544691e-06, + "loss": 0.4207, + "num_input_tokens_seen": 3658200, + "step": 5625 + }, + { + "epoch": 3.3195754716981134, + "grad_norm": 2.742058753967285, + "learning_rate": 9.868096531747149e-06, + "loss": 0.5041, + "num_input_tokens_seen": 3661720, + "step": 5630 + }, + { + "epoch": 3.3225235849056602, + "grad_norm": 1.7880358695983887, + "learning_rate": 9.867508849197848e-06, + "loss": 0.4718, + "num_input_tokens_seen": 3664216, + "step": 5635 + }, + { + "epoch": 3.3254716981132075, + "grad_norm": 1.2627674341201782, + "learning_rate": 9.866919877954602e-06, + "loss": 0.5067, + "num_input_tokens_seen": 3667800, + "step": 5640 + }, + { + "epoch": 3.328419811320755, + "grad_norm": 3.6054415702819824, + "learning_rate": 9.866329618173344e-06, + "loss": 0.485, + "num_input_tokens_seen": 3671800, + "step": 5645 + }, + { + "epoch": 3.331367924528302, + "grad_norm": 2.8440802097320557, + "learning_rate": 9.865738070010346e-06, + "loss": 0.4444, + "num_input_tokens_seen": 3674424, + "step": 5650 + }, + { + "epoch": 3.334316037735849, + "grad_norm": 1.8752282857894897, + "learning_rate": 9.865145233622223e-06, + "loss": 0.5183, + "num_input_tokens_seen": 3677144, + "step": 5655 + }, + { + "epoch": 3.3372641509433962, + "grad_norm": 3.0664122104644775, + "learning_rate": 9.864551109165935e-06, + "loss": 0.6434, + "num_input_tokens_seen": 3679768, + "step": 5660 + }, + { + "epoch": 3.3402122641509435, + "grad_norm": 1.25641930103302, + "learning_rate": 9.863955696798773e-06, + "loss": 0.4124, + "num_input_tokens_seen": 3687576, + "step": 5665 + }, + { + "epoch": 3.3431603773584904, + "grad_norm": 1.6079306602478027, + "learning_rate": 9.863358996678378e-06, + "loss": 0.4306, + "num_input_tokens_seen": 3690936, + "step": 5670 + }, + { + "epoch": 3.3461084905660377, + "grad_norm": 1.7526575326919556, + "learning_rate": 9.86276100896273e-06, + "loss": 0.3543, + "num_input_tokens_seen": 3694616, + "step": 5675 + }, + { + "epoch": 3.349056603773585, + "grad_norm": 4.281879901885986, + "learning_rate": 9.862161733810147e-06, + "loss": 0.5386, + "num_input_tokens_seen": 3697720, + "step": 5680 + }, + { + "epoch": 3.3520047169811322, + "grad_norm": 1.758099913597107, + "learning_rate": 9.86156117137929e-06, + "loss": 0.4007, + "num_input_tokens_seen": 3700120, + "step": 5685 + }, + { + "epoch": 3.354952830188679, + "grad_norm": 2.017382860183716, + "learning_rate": 9.860959321829159e-06, + "loss": 0.4636, + "num_input_tokens_seen": 3703768, + "step": 5690 + }, + { + "epoch": 3.3579009433962264, + "grad_norm": 2.042393445968628, + "learning_rate": 9.860356185319102e-06, + "loss": 0.514, + "num_input_tokens_seen": 3707448, + "step": 5695 + }, + { + "epoch": 3.3608490566037736, + "grad_norm": 4.5664873123168945, + "learning_rate": 9.859751762008796e-06, + "loss": 0.4787, + "num_input_tokens_seen": 3709976, + "step": 5700 + }, + { + "epoch": 3.363797169811321, + "grad_norm": 1.3283536434173584, + "learning_rate": 9.859146052058266e-06, + "loss": 0.6716, + "num_input_tokens_seen": 3713208, + "step": 5705 + }, + { + "epoch": 3.3667452830188678, + "grad_norm": 1.2281264066696167, + "learning_rate": 9.858539055627876e-06, + "loss": 0.3992, + "num_input_tokens_seen": 3716504, + "step": 5710 + }, + { + "epoch": 3.369693396226415, + "grad_norm": 2.228283166885376, + "learning_rate": 9.857930772878333e-06, + "loss": 0.6256, + "num_input_tokens_seen": 3718744, + "step": 5715 + }, + { + "epoch": 3.3726415094339623, + "grad_norm": 2.2391412258148193, + "learning_rate": 9.857321203970682e-06, + "loss": 0.4481, + "num_input_tokens_seen": 3721112, + "step": 5720 + }, + { + "epoch": 3.3755896226415096, + "grad_norm": 2.3149242401123047, + "learning_rate": 9.856710349066307e-06, + "loss": 0.5411, + "num_input_tokens_seen": 3725048, + "step": 5725 + }, + { + "epoch": 3.3785377358490565, + "grad_norm": 1.328795313835144, + "learning_rate": 9.856098208326937e-06, + "loss": 0.4255, + "num_input_tokens_seen": 3728504, + "step": 5730 + }, + { + "epoch": 3.3814858490566038, + "grad_norm": 2.1709048748016357, + "learning_rate": 9.855484781914639e-06, + "loss": 0.5363, + "num_input_tokens_seen": 3731448, + "step": 5735 + }, + { + "epoch": 3.384433962264151, + "grad_norm": 2.2954556941986084, + "learning_rate": 9.854870069991817e-06, + "loss": 0.5811, + "num_input_tokens_seen": 3734808, + "step": 5740 + }, + { + "epoch": 3.387382075471698, + "grad_norm": 3.190495252609253, + "learning_rate": 9.854254072721222e-06, + "loss": 0.5007, + "num_input_tokens_seen": 3737688, + "step": 5745 + }, + { + "epoch": 3.390330188679245, + "grad_norm": 1.5982915163040161, + "learning_rate": 9.853636790265938e-06, + "loss": 0.4135, + "num_input_tokens_seen": 3740888, + "step": 5750 + }, + { + "epoch": 3.3932783018867925, + "grad_norm": 1.603284478187561, + "learning_rate": 9.853018222789397e-06, + "loss": 0.3697, + "num_input_tokens_seen": 3744184, + "step": 5755 + }, + { + "epoch": 3.3962264150943398, + "grad_norm": 1.6270633935928345, + "learning_rate": 9.852398370455367e-06, + "loss": 0.4767, + "num_input_tokens_seen": 3747384, + "step": 5760 + }, + { + "epoch": 3.3991745283018866, + "grad_norm": 4.308558464050293, + "learning_rate": 9.851777233427955e-06, + "loss": 0.5057, + "num_input_tokens_seen": 3750296, + "step": 5765 + }, + { + "epoch": 3.402122641509434, + "grad_norm": 2.298288345336914, + "learning_rate": 9.85115481187161e-06, + "loss": 0.3903, + "num_input_tokens_seen": 3753624, + "step": 5770 + }, + { + "epoch": 3.405070754716981, + "grad_norm": 1.2613229751586914, + "learning_rate": 9.850531105951123e-06, + "loss": 0.502, + "num_input_tokens_seen": 3757400, + "step": 5775 + }, + { + "epoch": 3.4080188679245285, + "grad_norm": 2.876115083694458, + "learning_rate": 9.84990611583162e-06, + "loss": 0.4246, + "num_input_tokens_seen": 3760344, + "step": 5780 + }, + { + "epoch": 3.4109669811320753, + "grad_norm": 1.5410088300704956, + "learning_rate": 9.849279841678572e-06, + "loss": 0.4753, + "num_input_tokens_seen": 3763512, + "step": 5785 + }, + { + "epoch": 3.4139150943396226, + "grad_norm": 1.900124192237854, + "learning_rate": 9.848652283657785e-06, + "loss": 0.4299, + "num_input_tokens_seen": 3766232, + "step": 5790 + }, + { + "epoch": 3.41686320754717, + "grad_norm": 1.909995198249817, + "learning_rate": 9.848023441935411e-06, + "loss": 0.3983, + "num_input_tokens_seen": 3770616, + "step": 5795 + }, + { + "epoch": 3.419811320754717, + "grad_norm": 1.2678277492523193, + "learning_rate": 9.847393316677935e-06, + "loss": 0.3599, + "num_input_tokens_seen": 3773464, + "step": 5800 + }, + { + "epoch": 3.422759433962264, + "grad_norm": 1.2165656089782715, + "learning_rate": 9.846761908052188e-06, + "loss": 0.4248, + "num_input_tokens_seen": 3776472, + "step": 5805 + }, + { + "epoch": 3.4257075471698113, + "grad_norm": 1.8495151996612549, + "learning_rate": 9.846129216225338e-06, + "loss": 0.4617, + "num_input_tokens_seen": 3780056, + "step": 5810 + }, + { + "epoch": 3.4286556603773586, + "grad_norm": 3.7332139015197754, + "learning_rate": 9.845495241364892e-06, + "loss": 0.4772, + "num_input_tokens_seen": 3783800, + "step": 5815 + }, + { + "epoch": 3.4316037735849054, + "grad_norm": 2.291036367416382, + "learning_rate": 9.844859983638696e-06, + "loss": 0.4921, + "num_input_tokens_seen": 3786744, + "step": 5820 + }, + { + "epoch": 3.4345518867924527, + "grad_norm": 2.65614652633667, + "learning_rate": 9.844223443214942e-06, + "loss": 0.5713, + "num_input_tokens_seen": 3789624, + "step": 5825 + }, + { + "epoch": 3.4375, + "grad_norm": 1.4563069343566895, + "learning_rate": 9.843585620262153e-06, + "loss": 0.4781, + "num_input_tokens_seen": 3793816, + "step": 5830 + }, + { + "epoch": 3.4404481132075473, + "grad_norm": 10.31403636932373, + "learning_rate": 9.842946514949197e-06, + "loss": 0.5455, + "num_input_tokens_seen": 3796248, + "step": 5835 + }, + { + "epoch": 3.4433962264150946, + "grad_norm": 1.3758074045181274, + "learning_rate": 9.842306127445279e-06, + "loss": 0.5237, + "num_input_tokens_seen": 3799000, + "step": 5840 + }, + { + "epoch": 3.4463443396226414, + "grad_norm": 1.8348426818847656, + "learning_rate": 9.841664457919944e-06, + "loss": 0.4977, + "num_input_tokens_seen": 3801976, + "step": 5845 + }, + { + "epoch": 3.4492924528301887, + "grad_norm": 2.4773263931274414, + "learning_rate": 9.841021506543079e-06, + "loss": 0.544, + "num_input_tokens_seen": 3805432, + "step": 5850 + }, + { + "epoch": 3.452240566037736, + "grad_norm": 1.555792212486267, + "learning_rate": 9.840377273484904e-06, + "loss": 0.4171, + "num_input_tokens_seen": 3809144, + "step": 5855 + }, + { + "epoch": 3.455188679245283, + "grad_norm": 3.9127581119537354, + "learning_rate": 9.839731758915986e-06, + "loss": 0.5361, + "num_input_tokens_seen": 3812472, + "step": 5860 + }, + { + "epoch": 3.45813679245283, + "grad_norm": 2.701272964477539, + "learning_rate": 9.839084963007226e-06, + "loss": 0.4048, + "num_input_tokens_seen": 3815512, + "step": 5865 + }, + { + "epoch": 3.4610849056603774, + "grad_norm": 1.1864343881607056, + "learning_rate": 9.838436885929868e-06, + "loss": 0.7297, + "num_input_tokens_seen": 3817816, + "step": 5870 + }, + { + "epoch": 3.4640330188679247, + "grad_norm": 2.4107208251953125, + "learning_rate": 9.837787527855492e-06, + "loss": 0.363, + "num_input_tokens_seen": 3820568, + "step": 5875 + }, + { + "epoch": 3.4669811320754715, + "grad_norm": 2.6572368144989014, + "learning_rate": 9.837136888956017e-06, + "loss": 0.4708, + "num_input_tokens_seen": 3824088, + "step": 5880 + }, + { + "epoch": 3.469929245283019, + "grad_norm": 1.6846486330032349, + "learning_rate": 9.836484969403705e-06, + "loss": 0.4792, + "num_input_tokens_seen": 3827192, + "step": 5885 + }, + { + "epoch": 3.472877358490566, + "grad_norm": 3.556777238845825, + "learning_rate": 9.835831769371152e-06, + "loss": 0.4112, + "num_input_tokens_seen": 3829400, + "step": 5890 + }, + { + "epoch": 3.4758254716981134, + "grad_norm": 2.8453385829925537, + "learning_rate": 9.835177289031298e-06, + "loss": 0.4544, + "num_input_tokens_seen": 3831480, + "step": 5895 + }, + { + "epoch": 3.4787735849056602, + "grad_norm": 1.375619888305664, + "learning_rate": 9.834521528557419e-06, + "loss": 0.5183, + "num_input_tokens_seen": 3835032, + "step": 5900 + }, + { + "epoch": 3.4817216981132075, + "grad_norm": 1.86631178855896, + "learning_rate": 9.833864488123128e-06, + "loss": 0.3635, + "num_input_tokens_seen": 3838008, + "step": 5905 + }, + { + "epoch": 3.484669811320755, + "grad_norm": 1.8732240200042725, + "learning_rate": 9.83320616790238e-06, + "loss": 0.4173, + "num_input_tokens_seen": 3841432, + "step": 5910 + }, + { + "epoch": 3.487617924528302, + "grad_norm": 1.6845920085906982, + "learning_rate": 9.832546568069472e-06, + "loss": 0.4998, + "num_input_tokens_seen": 3844344, + "step": 5915 + }, + { + "epoch": 3.490566037735849, + "grad_norm": 2.3121297359466553, + "learning_rate": 9.831885688799031e-06, + "loss": 0.4914, + "num_input_tokens_seen": 3847544, + "step": 5920 + }, + { + "epoch": 3.4935141509433962, + "grad_norm": 1.8233729600906372, + "learning_rate": 9.83122353026603e-06, + "loss": 0.8029, + "num_input_tokens_seen": 3850616, + "step": 5925 + }, + { + "epoch": 3.4964622641509435, + "grad_norm": 1.4017064571380615, + "learning_rate": 9.830560092645778e-06, + "loss": 0.444, + "num_input_tokens_seen": 3854104, + "step": 5930 + }, + { + "epoch": 3.4994103773584904, + "grad_norm": 1.9475303888320923, + "learning_rate": 9.829895376113923e-06, + "loss": 0.4993, + "num_input_tokens_seen": 3858296, + "step": 5935 + }, + { + "epoch": 3.5023584905660377, + "grad_norm": 5.680035591125488, + "learning_rate": 9.829229380846452e-06, + "loss": 0.4349, + "num_input_tokens_seen": 3860952, + "step": 5940 + }, + { + "epoch": 3.505306603773585, + "grad_norm": 2.709632396697998, + "learning_rate": 9.82856210701969e-06, + "loss": 0.5228, + "num_input_tokens_seen": 3863800, + "step": 5945 + }, + { + "epoch": 3.5082547169811322, + "grad_norm": 2.132789134979248, + "learning_rate": 9.827893554810298e-06, + "loss": 0.5692, + "num_input_tokens_seen": 3867800, + "step": 5950 + }, + { + "epoch": 3.5112028301886795, + "grad_norm": 2.1033480167388916, + "learning_rate": 9.827223724395281e-06, + "loss": 0.5173, + "num_input_tokens_seen": 3870360, + "step": 5955 + }, + { + "epoch": 3.5141509433962264, + "grad_norm": 1.6832857131958008, + "learning_rate": 9.82655261595198e-06, + "loss": 0.4577, + "num_input_tokens_seen": 3873432, + "step": 5960 + }, + { + "epoch": 3.5170990566037736, + "grad_norm": 2.3264310359954834, + "learning_rate": 9.825880229658073e-06, + "loss": 0.5061, + "num_input_tokens_seen": 3876376, + "step": 5965 + }, + { + "epoch": 3.5200471698113205, + "grad_norm": 1.7049273252487183, + "learning_rate": 9.825206565691576e-06, + "loss": 0.5066, + "num_input_tokens_seen": 3879352, + "step": 5970 + }, + { + "epoch": 3.5229952830188678, + "grad_norm": 1.656238317489624, + "learning_rate": 9.824531624230844e-06, + "loss": 0.4673, + "num_input_tokens_seen": 3882712, + "step": 5975 + }, + { + "epoch": 3.525943396226415, + "grad_norm": 1.9921181201934814, + "learning_rate": 9.823855405454573e-06, + "loss": 0.5376, + "num_input_tokens_seen": 3886072, + "step": 5980 + }, + { + "epoch": 3.5288915094339623, + "grad_norm": 2.7016568183898926, + "learning_rate": 9.823177909541795e-06, + "loss": 0.4141, + "num_input_tokens_seen": 3889304, + "step": 5985 + }, + { + "epoch": 3.5318396226415096, + "grad_norm": 3.4773778915405273, + "learning_rate": 9.822499136671877e-06, + "loss": 0.3906, + "num_input_tokens_seen": 3892600, + "step": 5990 + }, + { + "epoch": 3.5347877358490565, + "grad_norm": 2.423518419265747, + "learning_rate": 9.82181908702453e-06, + "loss": 0.4666, + "num_input_tokens_seen": 3895992, + "step": 5995 + }, + { + "epoch": 3.5377358490566038, + "grad_norm": 2.0235812664031982, + "learning_rate": 9.821137760779797e-06, + "loss": 0.6857, + "num_input_tokens_seen": 3899480, + "step": 6000 + }, + { + "epoch": 3.540683962264151, + "grad_norm": 2.6161768436431885, + "learning_rate": 9.820455158118065e-06, + "loss": 0.4807, + "num_input_tokens_seen": 3901880, + "step": 6005 + }, + { + "epoch": 3.543632075471698, + "grad_norm": 2.06429123878479, + "learning_rate": 9.819771279220053e-06, + "loss": 0.4873, + "num_input_tokens_seen": 3906808, + "step": 6010 + }, + { + "epoch": 3.546580188679245, + "grad_norm": 1.429808497428894, + "learning_rate": 9.819086124266825e-06, + "loss": 0.4121, + "num_input_tokens_seen": 3910296, + "step": 6015 + }, + { + "epoch": 3.5495283018867925, + "grad_norm": 3.465043783187866, + "learning_rate": 9.818399693439778e-06, + "loss": 0.5161, + "num_input_tokens_seen": 3913400, + "step": 6020 + }, + { + "epoch": 3.5524764150943398, + "grad_norm": 1.3960872888565063, + "learning_rate": 9.817711986920644e-06, + "loss": 0.4391, + "num_input_tokens_seen": 3915896, + "step": 6025 + }, + { + "epoch": 3.555424528301887, + "grad_norm": 2.4132487773895264, + "learning_rate": 9.817023004891497e-06, + "loss": 0.4052, + "num_input_tokens_seen": 3919352, + "step": 6030 + }, + { + "epoch": 3.558372641509434, + "grad_norm": 7.3894171714782715, + "learning_rate": 9.816332747534752e-06, + "loss": 0.6553, + "num_input_tokens_seen": 3921560, + "step": 6035 + }, + { + "epoch": 3.561320754716981, + "grad_norm": 5.019872665405273, + "learning_rate": 9.815641215033153e-06, + "loss": 0.6066, + "num_input_tokens_seen": 3924824, + "step": 6040 + }, + { + "epoch": 3.5642688679245285, + "grad_norm": 0.8585574626922607, + "learning_rate": 9.814948407569789e-06, + "loss": 0.4676, + "num_input_tokens_seen": 3928216, + "step": 6045 + }, + { + "epoch": 3.5672169811320753, + "grad_norm": 2.468637228012085, + "learning_rate": 9.814254325328082e-06, + "loss": 0.8209, + "num_input_tokens_seen": 3931192, + "step": 6050 + }, + { + "epoch": 3.5701650943396226, + "grad_norm": 1.4459161758422852, + "learning_rate": 9.813558968491794e-06, + "loss": 0.3313, + "num_input_tokens_seen": 3934072, + "step": 6055 + }, + { + "epoch": 3.57311320754717, + "grad_norm": 2.385265350341797, + "learning_rate": 9.812862337245024e-06, + "loss": 0.4213, + "num_input_tokens_seen": 3937048, + "step": 6060 + }, + { + "epoch": 3.576061320754717, + "grad_norm": 1.4659218788146973, + "learning_rate": 9.812164431772208e-06, + "loss": 0.4467, + "num_input_tokens_seen": 3940472, + "step": 6065 + }, + { + "epoch": 3.579009433962264, + "grad_norm": 2.363335132598877, + "learning_rate": 9.81146525225812e-06, + "loss": 0.4232, + "num_input_tokens_seen": 3943352, + "step": 6070 + }, + { + "epoch": 3.5819575471698113, + "grad_norm": 2.0104377269744873, + "learning_rate": 9.810764798887868e-06, + "loss": 0.4158, + "num_input_tokens_seen": 3948120, + "step": 6075 + }, + { + "epoch": 3.5849056603773586, + "grad_norm": 2.774599313735962, + "learning_rate": 9.810063071846905e-06, + "loss": 0.4391, + "num_input_tokens_seen": 3951640, + "step": 6080 + }, + { + "epoch": 3.5878537735849054, + "grad_norm": 3.1206107139587402, + "learning_rate": 9.809360071321013e-06, + "loss": 0.5251, + "num_input_tokens_seen": 3954232, + "step": 6085 + }, + { + "epoch": 3.5908018867924527, + "grad_norm": 1.4326025247573853, + "learning_rate": 9.808655797496314e-06, + "loss": 0.3834, + "num_input_tokens_seen": 3957592, + "step": 6090 + }, + { + "epoch": 3.59375, + "grad_norm": 2.290938138961792, + "learning_rate": 9.807950250559268e-06, + "loss": 0.4787, + "num_input_tokens_seen": 3960408, + "step": 6095 + }, + { + "epoch": 3.5966981132075473, + "grad_norm": 2.7492825984954834, + "learning_rate": 9.807243430696673e-06, + "loss": 0.4039, + "num_input_tokens_seen": 3963768, + "step": 6100 + }, + { + "epoch": 3.5996462264150946, + "grad_norm": 2.6364223957061768, + "learning_rate": 9.806535338095661e-06, + "loss": 0.3762, + "num_input_tokens_seen": 3967128, + "step": 6105 + }, + { + "epoch": 3.6025943396226414, + "grad_norm": 6.124648571014404, + "learning_rate": 9.805825972943706e-06, + "loss": 0.4224, + "num_input_tokens_seen": 3969304, + "step": 6110 + }, + { + "epoch": 3.6055424528301887, + "grad_norm": 1.3641059398651123, + "learning_rate": 9.80511533542861e-06, + "loss": 0.3702, + "num_input_tokens_seen": 3972024, + "step": 6115 + }, + { + "epoch": 3.608490566037736, + "grad_norm": 2.9206619262695312, + "learning_rate": 9.80440342573852e-06, + "loss": 0.4523, + "num_input_tokens_seen": 3975032, + "step": 6120 + }, + { + "epoch": 3.611438679245283, + "grad_norm": 1.459501028060913, + "learning_rate": 9.803690244061919e-06, + "loss": 0.4127, + "num_input_tokens_seen": 3977944, + "step": 6125 + }, + { + "epoch": 3.61438679245283, + "grad_norm": 2.5439250469207764, + "learning_rate": 9.802975790587621e-06, + "loss": 0.4905, + "num_input_tokens_seen": 3981304, + "step": 6130 + }, + { + "epoch": 3.6173349056603774, + "grad_norm": 1.7963510751724243, + "learning_rate": 9.802260065504783e-06, + "loss": 0.5802, + "num_input_tokens_seen": 3984792, + "step": 6135 + }, + { + "epoch": 3.6202830188679247, + "grad_norm": 1.4936938285827637, + "learning_rate": 9.801543069002897e-06, + "loss": 0.3999, + "num_input_tokens_seen": 3987832, + "step": 6140 + }, + { + "epoch": 3.6232311320754715, + "grad_norm": 16.707948684692383, + "learning_rate": 9.80082480127179e-06, + "loss": 0.3942, + "num_input_tokens_seen": 3991800, + "step": 6145 + }, + { + "epoch": 3.626179245283019, + "grad_norm": 1.1759858131408691, + "learning_rate": 9.800105262501628e-06, + "loss": 0.4654, + "num_input_tokens_seen": 3995096, + "step": 6150 + }, + { + "epoch": 3.629127358490566, + "grad_norm": 1.2978332042694092, + "learning_rate": 9.799384452882907e-06, + "loss": 0.479, + "num_input_tokens_seen": 3997880, + "step": 6155 + }, + { + "epoch": 3.632075471698113, + "grad_norm": 2.4032607078552246, + "learning_rate": 9.798662372606469e-06, + "loss": 0.4028, + "num_input_tokens_seen": 4001400, + "step": 6160 + }, + { + "epoch": 3.6350235849056602, + "grad_norm": 2.098317861557007, + "learning_rate": 9.797939021863487e-06, + "loss": 0.4583, + "num_input_tokens_seen": 4004344, + "step": 6165 + }, + { + "epoch": 3.6379716981132075, + "grad_norm": 6.382221221923828, + "learning_rate": 9.797214400845472e-06, + "loss": 0.6341, + "num_input_tokens_seen": 4007416, + "step": 6170 + }, + { + "epoch": 3.640919811320755, + "grad_norm": 3.4683034420013428, + "learning_rate": 9.796488509744269e-06, + "loss": 0.5083, + "num_input_tokens_seen": 4010488, + "step": 6175 + }, + { + "epoch": 3.643867924528302, + "grad_norm": 2.5313379764556885, + "learning_rate": 9.79576134875206e-06, + "loss": 0.5237, + "num_input_tokens_seen": 4013144, + "step": 6180 + }, + { + "epoch": 3.646816037735849, + "grad_norm": 2.1807286739349365, + "learning_rate": 9.795032918061367e-06, + "loss": 0.512, + "num_input_tokens_seen": 4016312, + "step": 6185 + }, + { + "epoch": 3.6497641509433962, + "grad_norm": 1.6005451679229736, + "learning_rate": 9.794303217865041e-06, + "loss": 0.5066, + "num_input_tokens_seen": 4019896, + "step": 6190 + }, + { + "epoch": 3.6527122641509435, + "grad_norm": 1.542596697807312, + "learning_rate": 9.79357224835628e-06, + "loss": 0.5833, + "num_input_tokens_seen": 4022872, + "step": 6195 + }, + { + "epoch": 3.6556603773584904, + "grad_norm": 1.9818195104599, + "learning_rate": 9.792840009728605e-06, + "loss": 0.4617, + "num_input_tokens_seen": 4025912, + "step": 6200 + }, + { + "epoch": 3.6586084905660377, + "grad_norm": 1.441719651222229, + "learning_rate": 9.79210650217588e-06, + "loss": 0.4697, + "num_input_tokens_seen": 4029208, + "step": 6205 + }, + { + "epoch": 3.661556603773585, + "grad_norm": 1.5752439498901367, + "learning_rate": 9.791371725892307e-06, + "loss": 0.5245, + "num_input_tokens_seen": 4032120, + "step": 6210 + }, + { + "epoch": 3.6645047169811322, + "grad_norm": 1.0782500505447388, + "learning_rate": 9.79063568107242e-06, + "loss": 0.4678, + "num_input_tokens_seen": 4035576, + "step": 6215 + }, + { + "epoch": 3.6674528301886795, + "grad_norm": 1.9574594497680664, + "learning_rate": 9.78989836791109e-06, + "loss": 0.4948, + "num_input_tokens_seen": 4039096, + "step": 6220 + }, + { + "epoch": 3.6704009433962264, + "grad_norm": 1.5621784925460815, + "learning_rate": 9.789159786603524e-06, + "loss": 0.4846, + "num_input_tokens_seen": 4045720, + "step": 6225 + }, + { + "epoch": 3.6733490566037736, + "grad_norm": 1.8161766529083252, + "learning_rate": 9.788419937345263e-06, + "loss": 0.5236, + "num_input_tokens_seen": 4048760, + "step": 6230 + }, + { + "epoch": 3.6762971698113205, + "grad_norm": 1.7863855361938477, + "learning_rate": 9.787678820332188e-06, + "loss": 0.5792, + "num_input_tokens_seen": 4052408, + "step": 6235 + }, + { + "epoch": 3.6792452830188678, + "grad_norm": 1.747335433959961, + "learning_rate": 9.78693643576051e-06, + "loss": 0.3448, + "num_input_tokens_seen": 4054872, + "step": 6240 + }, + { + "epoch": 3.682193396226415, + "grad_norm": 3.1540074348449707, + "learning_rate": 9.786192783826782e-06, + "loss": 0.4117, + "num_input_tokens_seen": 4057304, + "step": 6245 + }, + { + "epoch": 3.6851415094339623, + "grad_norm": 1.4725329875946045, + "learning_rate": 9.785447864727887e-06, + "loss": 0.4217, + "num_input_tokens_seen": 4060056, + "step": 6250 + }, + { + "epoch": 3.6880896226415096, + "grad_norm": 1.585604190826416, + "learning_rate": 9.784701678661045e-06, + "loss": 0.45, + "num_input_tokens_seen": 4062808, + "step": 6255 + }, + { + "epoch": 3.6910377358490565, + "grad_norm": 1.9884909391403198, + "learning_rate": 9.783954225823813e-06, + "loss": 0.3605, + "num_input_tokens_seen": 4066360, + "step": 6260 + }, + { + "epoch": 3.6939858490566038, + "grad_norm": 1.9366538524627686, + "learning_rate": 9.783205506414082e-06, + "loss": 0.6737, + "num_input_tokens_seen": 4071352, + "step": 6265 + }, + { + "epoch": 3.696933962264151, + "grad_norm": 1.231418251991272, + "learning_rate": 9.782455520630079e-06, + "loss": 0.4795, + "num_input_tokens_seen": 4076952, + "step": 6270 + }, + { + "epoch": 3.699882075471698, + "grad_norm": 1.4096026420593262, + "learning_rate": 9.781704268670364e-06, + "loss": 0.5517, + "num_input_tokens_seen": 4080952, + "step": 6275 + }, + { + "epoch": 3.702830188679245, + "grad_norm": 1.0999194383621216, + "learning_rate": 9.780951750733837e-06, + "loss": 0.4417, + "num_input_tokens_seen": 4084888, + "step": 6280 + }, + { + "epoch": 3.7057783018867925, + "grad_norm": 1.1066210269927979, + "learning_rate": 9.780197967019728e-06, + "loss": 0.414, + "num_input_tokens_seen": 4087640, + "step": 6285 + }, + { + "epoch": 3.7087264150943398, + "grad_norm": 3.3654232025146484, + "learning_rate": 9.779442917727608e-06, + "loss": 0.5131, + "num_input_tokens_seen": 4090584, + "step": 6290 + }, + { + "epoch": 3.711674528301887, + "grad_norm": 2.059638738632202, + "learning_rate": 9.778686603057377e-06, + "loss": 0.428, + "num_input_tokens_seen": 4096248, + "step": 6295 + }, + { + "epoch": 3.714622641509434, + "grad_norm": 2.241614580154419, + "learning_rate": 9.777929023209271e-06, + "loss": 0.5105, + "num_input_tokens_seen": 4099000, + "step": 6300 + }, + { + "epoch": 3.717570754716981, + "grad_norm": 3.676776170730591, + "learning_rate": 9.777170178383866e-06, + "loss": 0.4514, + "num_input_tokens_seen": 4102968, + "step": 6305 + }, + { + "epoch": 3.7205188679245285, + "grad_norm": 2.588263511657715, + "learning_rate": 9.776410068782068e-06, + "loss": 0.4082, + "num_input_tokens_seen": 4106136, + "step": 6310 + }, + { + "epoch": 3.7234669811320753, + "grad_norm": 1.212329626083374, + "learning_rate": 9.775648694605118e-06, + "loss": 0.4888, + "num_input_tokens_seen": 4109912, + "step": 6315 + }, + { + "epoch": 3.7264150943396226, + "grad_norm": 2.049220323562622, + "learning_rate": 9.774886056054593e-06, + "loss": 0.5083, + "num_input_tokens_seen": 4112920, + "step": 6320 + }, + { + "epoch": 3.72936320754717, + "grad_norm": 1.1754775047302246, + "learning_rate": 9.774122153332408e-06, + "loss": 0.4457, + "num_input_tokens_seen": 4116152, + "step": 6325 + }, + { + "epoch": 3.732311320754717, + "grad_norm": 3.4246468544006348, + "learning_rate": 9.773356986640807e-06, + "loss": 0.4649, + "num_input_tokens_seen": 4120280, + "step": 6330 + }, + { + "epoch": 3.735259433962264, + "grad_norm": 2.438265085220337, + "learning_rate": 9.772590556182373e-06, + "loss": 0.4834, + "num_input_tokens_seen": 4122840, + "step": 6335 + }, + { + "epoch": 3.7382075471698113, + "grad_norm": 1.4794487953186035, + "learning_rate": 9.77182286216002e-06, + "loss": 0.3839, + "num_input_tokens_seen": 4126232, + "step": 6340 + }, + { + "epoch": 3.7411556603773586, + "grad_norm": 2.042238712310791, + "learning_rate": 9.771053904776998e-06, + "loss": 0.4966, + "num_input_tokens_seen": 4129752, + "step": 6345 + }, + { + "epoch": 3.7441037735849054, + "grad_norm": 2.4753189086914062, + "learning_rate": 9.770283684236891e-06, + "loss": 0.7019, + "num_input_tokens_seen": 4132280, + "step": 6350 + }, + { + "epoch": 3.7470518867924527, + "grad_norm": 1.3164738416671753, + "learning_rate": 9.769512200743623e-06, + "loss": 0.4648, + "num_input_tokens_seen": 4135864, + "step": 6355 + }, + { + "epoch": 3.75, + "grad_norm": 2.447172164916992, + "learning_rate": 9.768739454501444e-06, + "loss": 0.5098, + "num_input_tokens_seen": 4138680, + "step": 6360 + }, + { + "epoch": 3.7529481132075473, + "grad_norm": 5.2237725257873535, + "learning_rate": 9.76796544571494e-06, + "loss": 0.4696, + "num_input_tokens_seen": 4141368, + "step": 6365 + }, + { + "epoch": 3.7558962264150946, + "grad_norm": 2.027153253555298, + "learning_rate": 9.767190174589036e-06, + "loss": 0.448, + "num_input_tokens_seen": 4144760, + "step": 6370 + }, + { + "epoch": 3.7588443396226414, + "grad_norm": 3.7473416328430176, + "learning_rate": 9.76641364132899e-06, + "loss": 0.5654, + "num_input_tokens_seen": 4147416, + "step": 6375 + }, + { + "epoch": 3.7617924528301887, + "grad_norm": 1.8635953664779663, + "learning_rate": 9.765635846140389e-06, + "loss": 0.483, + "num_input_tokens_seen": 4150328, + "step": 6380 + }, + { + "epoch": 3.764740566037736, + "grad_norm": 1.0663868188858032, + "learning_rate": 9.764856789229157e-06, + "loss": 0.4791, + "num_input_tokens_seen": 4153528, + "step": 6385 + }, + { + "epoch": 3.767688679245283, + "grad_norm": 3.030426025390625, + "learning_rate": 9.764076470801557e-06, + "loss": 0.3479, + "num_input_tokens_seen": 4159896, + "step": 6390 + }, + { + "epoch": 3.77063679245283, + "grad_norm": 1.5392131805419922, + "learning_rate": 9.763294891064182e-06, + "loss": 0.4897, + "num_input_tokens_seen": 4163160, + "step": 6395 + }, + { + "epoch": 3.7735849056603774, + "grad_norm": 1.3763707876205444, + "learning_rate": 9.762512050223951e-06, + "loss": 0.5124, + "num_input_tokens_seen": 4166168, + "step": 6400 + }, + { + "epoch": 3.7765330188679247, + "grad_norm": 2.2697227001190186, + "learning_rate": 9.761727948488132e-06, + "loss": 0.4488, + "num_input_tokens_seen": 4169016, + "step": 6405 + }, + { + "epoch": 3.7794811320754715, + "grad_norm": 1.6069093942642212, + "learning_rate": 9.760942586064315e-06, + "loss": 0.6208, + "num_input_tokens_seen": 4172248, + "step": 6410 + }, + { + "epoch": 3.782429245283019, + "grad_norm": 2.738206624984741, + "learning_rate": 9.760155963160431e-06, + "loss": 0.4391, + "num_input_tokens_seen": 4175000, + "step": 6415 + }, + { + "epoch": 3.785377358490566, + "grad_norm": 2.1404919624328613, + "learning_rate": 9.759368079984741e-06, + "loss": 0.5175, + "num_input_tokens_seen": 4178168, + "step": 6420 + }, + { + "epoch": 3.788325471698113, + "grad_norm": 1.4808086156845093, + "learning_rate": 9.758578936745839e-06, + "loss": 0.5281, + "num_input_tokens_seen": 4181688, + "step": 6425 + }, + { + "epoch": 3.7912735849056602, + "grad_norm": 4.177740097045898, + "learning_rate": 9.757788533652656e-06, + "loss": 0.4679, + "num_input_tokens_seen": 4184728, + "step": 6430 + }, + { + "epoch": 3.7942216981132075, + "grad_norm": 1.8638392686843872, + "learning_rate": 9.756996870914454e-06, + "loss": 0.3743, + "num_input_tokens_seen": 4188184, + "step": 6435 + }, + { + "epoch": 3.797169811320755, + "grad_norm": 2.0354652404785156, + "learning_rate": 9.756203948740828e-06, + "loss": 0.4798, + "num_input_tokens_seen": 4190808, + "step": 6440 + }, + { + "epoch": 3.800117924528302, + "grad_norm": 1.6854146718978882, + "learning_rate": 9.755409767341709e-06, + "loss": 0.4782, + "num_input_tokens_seen": 4194808, + "step": 6445 + }, + { + "epoch": 3.803066037735849, + "grad_norm": 2.303384304046631, + "learning_rate": 9.75461432692736e-06, + "loss": 0.5435, + "num_input_tokens_seen": 4197400, + "step": 6450 + }, + { + "epoch": 3.8060141509433962, + "grad_norm": 1.7422261238098145, + "learning_rate": 9.753817627708375e-06, + "loss": 0.5429, + "num_input_tokens_seen": 4199736, + "step": 6455 + }, + { + "epoch": 3.8089622641509435, + "grad_norm": 1.3491671085357666, + "learning_rate": 9.753019669895686e-06, + "loss": 0.598, + "num_input_tokens_seen": 4202680, + "step": 6460 + }, + { + "epoch": 3.8119103773584904, + "grad_norm": 1.4366841316223145, + "learning_rate": 9.752220453700556e-06, + "loss": 0.5387, + "num_input_tokens_seen": 4205464, + "step": 6465 + }, + { + "epoch": 3.8148584905660377, + "grad_norm": 1.5678027868270874, + "learning_rate": 9.75141997933458e-06, + "loss": 0.5288, + "num_input_tokens_seen": 4208856, + "step": 6470 + }, + { + "epoch": 3.817806603773585, + "grad_norm": 1.3407434225082397, + "learning_rate": 9.750618247009685e-06, + "loss": 0.5233, + "num_input_tokens_seen": 4212728, + "step": 6475 + }, + { + "epoch": 3.8207547169811322, + "grad_norm": 1.1687155961990356, + "learning_rate": 9.749815256938138e-06, + "loss": 0.4468, + "num_input_tokens_seen": 4215960, + "step": 6480 + }, + { + "epoch": 3.8237028301886795, + "grad_norm": 1.9046894311904907, + "learning_rate": 9.749011009332529e-06, + "loss": 0.3658, + "num_input_tokens_seen": 4219096, + "step": 6485 + }, + { + "epoch": 3.8266509433962264, + "grad_norm": 2.041167736053467, + "learning_rate": 9.748205504405787e-06, + "loss": 0.4501, + "num_input_tokens_seen": 4223032, + "step": 6490 + }, + { + "epoch": 3.8295990566037736, + "grad_norm": 1.5436069965362549, + "learning_rate": 9.747398742371177e-06, + "loss": 0.4556, + "num_input_tokens_seen": 4225656, + "step": 6495 + }, + { + "epoch": 3.8325471698113205, + "grad_norm": 2.265415668487549, + "learning_rate": 9.746590723442289e-06, + "loss": 0.4178, + "num_input_tokens_seen": 4229720, + "step": 6500 + }, + { + "epoch": 3.8354952830188678, + "grad_norm": 2.1142921447753906, + "learning_rate": 9.745781447833049e-06, + "loss": 0.3584, + "num_input_tokens_seen": 4233464, + "step": 6505 + }, + { + "epoch": 3.838443396226415, + "grad_norm": 6.028593063354492, + "learning_rate": 9.74497091575772e-06, + "loss": 0.4143, + "num_input_tokens_seen": 4236120, + "step": 6510 + }, + { + "epoch": 3.8413915094339623, + "grad_norm": 2.431793451309204, + "learning_rate": 9.744159127430888e-06, + "loss": 0.6568, + "num_input_tokens_seen": 4239448, + "step": 6515 + }, + { + "epoch": 3.8443396226415096, + "grad_norm": 3.3506648540496826, + "learning_rate": 9.743346083067482e-06, + "loss": 0.3596, + "num_input_tokens_seen": 4242616, + "step": 6520 + }, + { + "epoch": 3.8472877358490565, + "grad_norm": 4.8003435134887695, + "learning_rate": 9.742531782882758e-06, + "loss": 0.5789, + "num_input_tokens_seen": 4245528, + "step": 6525 + }, + { + "epoch": 3.8502358490566038, + "grad_norm": 1.9072314500808716, + "learning_rate": 9.741716227092305e-06, + "loss": 0.4895, + "num_input_tokens_seen": 4248120, + "step": 6530 + }, + { + "epoch": 3.853183962264151, + "grad_norm": 1.9113293886184692, + "learning_rate": 9.740899415912048e-06, + "loss": 0.5454, + "num_input_tokens_seen": 4250808, + "step": 6535 + }, + { + "epoch": 3.856132075471698, + "grad_norm": 3.0875918865203857, + "learning_rate": 9.740081349558236e-06, + "loss": 0.4629, + "num_input_tokens_seen": 4254328, + "step": 6540 + }, + { + "epoch": 3.859080188679245, + "grad_norm": 1.70663321018219, + "learning_rate": 9.739262028247459e-06, + "loss": 0.6465, + "num_input_tokens_seen": 4260024, + "step": 6545 + }, + { + "epoch": 3.8620283018867925, + "grad_norm": 2.228905439376831, + "learning_rate": 9.738441452196633e-06, + "loss": 0.3888, + "num_input_tokens_seen": 4262680, + "step": 6550 + }, + { + "epoch": 3.8649764150943398, + "grad_norm": 3.6392974853515625, + "learning_rate": 9.737619621623013e-06, + "loss": 0.4758, + "num_input_tokens_seen": 4265784, + "step": 6555 + }, + { + "epoch": 3.867924528301887, + "grad_norm": 1.7249586582183838, + "learning_rate": 9.73679653674418e-06, + "loss": 0.5636, + "num_input_tokens_seen": 4268792, + "step": 6560 + }, + { + "epoch": 3.870872641509434, + "grad_norm": 5.577042102813721, + "learning_rate": 9.735972197778047e-06, + "loss": 0.5459, + "num_input_tokens_seen": 4271576, + "step": 6565 + }, + { + "epoch": 3.873820754716981, + "grad_norm": 2.455580711364746, + "learning_rate": 9.735146604942867e-06, + "loss": 0.3232, + "num_input_tokens_seen": 4274360, + "step": 6570 + }, + { + "epoch": 3.8767688679245285, + "grad_norm": 3.610812187194824, + "learning_rate": 9.734319758457214e-06, + "loss": 0.4813, + "num_input_tokens_seen": 4277144, + "step": 6575 + }, + { + "epoch": 3.8797169811320753, + "grad_norm": 1.7456868886947632, + "learning_rate": 9.733491658540001e-06, + "loss": 0.4674, + "num_input_tokens_seen": 4280184, + "step": 6580 + }, + { + "epoch": 3.8826650943396226, + "grad_norm": 1.9427430629730225, + "learning_rate": 9.732662305410474e-06, + "loss": 0.4806, + "num_input_tokens_seen": 4282616, + "step": 6585 + }, + { + "epoch": 3.88561320754717, + "grad_norm": 1.8242812156677246, + "learning_rate": 9.731831699288203e-06, + "loss": 0.7064, + "num_input_tokens_seen": 4286072, + "step": 6590 + }, + { + "epoch": 3.888561320754717, + "grad_norm": 1.366325855255127, + "learning_rate": 9.730999840393096e-06, + "loss": 0.4256, + "num_input_tokens_seen": 4289496, + "step": 6595 + }, + { + "epoch": 3.891509433962264, + "grad_norm": 2.40189266204834, + "learning_rate": 9.730166728945391e-06, + "loss": 0.4116, + "num_input_tokens_seen": 4292280, + "step": 6600 + }, + { + "epoch": 3.8944575471698113, + "grad_norm": 3.8786261081695557, + "learning_rate": 9.72933236516566e-06, + "loss": 0.5178, + "num_input_tokens_seen": 4295096, + "step": 6605 + }, + { + "epoch": 3.8974056603773586, + "grad_norm": 2.2663419246673584, + "learning_rate": 9.728496749274806e-06, + "loss": 0.434, + "num_input_tokens_seen": 4299288, + "step": 6610 + }, + { + "epoch": 3.9003537735849054, + "grad_norm": 1.5046064853668213, + "learning_rate": 9.727659881494054e-06, + "loss": 0.5354, + "num_input_tokens_seen": 4304376, + "step": 6615 + }, + { + "epoch": 3.9033018867924527, + "grad_norm": 2.110302686691284, + "learning_rate": 9.726821762044975e-06, + "loss": 0.5407, + "num_input_tokens_seen": 4309432, + "step": 6620 + }, + { + "epoch": 3.90625, + "grad_norm": 1.2773325443267822, + "learning_rate": 9.725982391149465e-06, + "loss": 0.4264, + "num_input_tokens_seen": 4313432, + "step": 6625 + }, + { + "epoch": 3.9091981132075473, + "grad_norm": 1.06777822971344, + "learning_rate": 9.725141769029747e-06, + "loss": 0.4589, + "num_input_tokens_seen": 4317464, + "step": 6630 + }, + { + "epoch": 3.9121462264150946, + "grad_norm": 1.6418951749801636, + "learning_rate": 9.72429989590838e-06, + "loss": 0.4014, + "num_input_tokens_seen": 4320280, + "step": 6635 + }, + { + "epoch": 3.9150943396226414, + "grad_norm": 1.787442684173584, + "learning_rate": 9.723456772008257e-06, + "loss": 0.5671, + "num_input_tokens_seen": 4323576, + "step": 6640 + }, + { + "epoch": 3.9180424528301887, + "grad_norm": 2.2333014011383057, + "learning_rate": 9.722612397552598e-06, + "loss": 0.3822, + "num_input_tokens_seen": 4326616, + "step": 6645 + }, + { + "epoch": 3.920990566037736, + "grad_norm": 1.1333509683609009, + "learning_rate": 9.72176677276495e-06, + "loss": 0.5446, + "num_input_tokens_seen": 4330168, + "step": 6650 + }, + { + "epoch": 3.923938679245283, + "grad_norm": 1.2473218441009521, + "learning_rate": 9.7209198978692e-06, + "loss": 0.4958, + "num_input_tokens_seen": 4333464, + "step": 6655 + }, + { + "epoch": 3.92688679245283, + "grad_norm": 2.5179078578948975, + "learning_rate": 9.720071773089564e-06, + "loss": 0.6165, + "num_input_tokens_seen": 4336504, + "step": 6660 + }, + { + "epoch": 3.9298349056603774, + "grad_norm": 1.5156223773956299, + "learning_rate": 9.71922239865058e-06, + "loss": 0.4006, + "num_input_tokens_seen": 4340088, + "step": 6665 + }, + { + "epoch": 3.9327830188679247, + "grad_norm": 1.633601188659668, + "learning_rate": 9.718371774777131e-06, + "loss": 0.3981, + "num_input_tokens_seen": 4344120, + "step": 6670 + }, + { + "epoch": 3.9357311320754715, + "grad_norm": 1.2970178127288818, + "learning_rate": 9.717519901694416e-06, + "loss": 0.6055, + "num_input_tokens_seen": 4347928, + "step": 6675 + }, + { + "epoch": 3.938679245283019, + "grad_norm": 2.250190019607544, + "learning_rate": 9.716666779627978e-06, + "loss": 0.5198, + "num_input_tokens_seen": 4351192, + "step": 6680 + }, + { + "epoch": 3.941627358490566, + "grad_norm": 4.729641914367676, + "learning_rate": 9.715812408803681e-06, + "loss": 0.4781, + "num_input_tokens_seen": 4353592, + "step": 6685 + }, + { + "epoch": 3.944575471698113, + "grad_norm": 1.5684490203857422, + "learning_rate": 9.714956789447726e-06, + "loss": 0.4144, + "num_input_tokens_seen": 4356632, + "step": 6690 + }, + { + "epoch": 3.9475235849056602, + "grad_norm": 1.032141089439392, + "learning_rate": 9.71409992178664e-06, + "loss": 0.3851, + "num_input_tokens_seen": 4359864, + "step": 6695 + }, + { + "epoch": 3.9504716981132075, + "grad_norm": 1.2516428232192993, + "learning_rate": 9.713241806047282e-06, + "loss": 0.4902, + "num_input_tokens_seen": 4363480, + "step": 6700 + }, + { + "epoch": 3.953419811320755, + "grad_norm": 1.492336392402649, + "learning_rate": 9.712382442456845e-06, + "loss": 0.5751, + "num_input_tokens_seen": 4367544, + "step": 6705 + }, + { + "epoch": 3.956367924528302, + "grad_norm": 1.9288151264190674, + "learning_rate": 9.711521831242846e-06, + "loss": 0.4691, + "num_input_tokens_seen": 4370712, + "step": 6710 + }, + { + "epoch": 3.959316037735849, + "grad_norm": 2.6786880493164062, + "learning_rate": 9.710659972633137e-06, + "loss": 0.3743, + "num_input_tokens_seen": 4374136, + "step": 6715 + }, + { + "epoch": 3.9622641509433962, + "grad_norm": 1.5475469827651978, + "learning_rate": 9.709796866855899e-06, + "loss": 0.3075, + "num_input_tokens_seen": 4376984, + "step": 6720 + }, + { + "epoch": 3.9652122641509435, + "grad_norm": 1.9753527641296387, + "learning_rate": 9.70893251413964e-06, + "loss": 0.5512, + "num_input_tokens_seen": 4380664, + "step": 6725 + }, + { + "epoch": 3.9681603773584904, + "grad_norm": 1.3781172037124634, + "learning_rate": 9.708066914713205e-06, + "loss": 0.5348, + "num_input_tokens_seen": 4383800, + "step": 6730 + }, + { + "epoch": 3.9711084905660377, + "grad_norm": 3.0246148109436035, + "learning_rate": 9.707200068805764e-06, + "loss": 0.6916, + "num_input_tokens_seen": 4387576, + "step": 6735 + }, + { + "epoch": 3.974056603773585, + "grad_norm": 1.4945316314697266, + "learning_rate": 9.706331976646817e-06, + "loss": 0.3833, + "num_input_tokens_seen": 4391224, + "step": 6740 + }, + { + "epoch": 3.9770047169811322, + "grad_norm": 2.9747040271759033, + "learning_rate": 9.705462638466197e-06, + "loss": 0.4771, + "num_input_tokens_seen": 4394072, + "step": 6745 + }, + { + "epoch": 3.9799528301886795, + "grad_norm": 1.7769852876663208, + "learning_rate": 9.704592054494065e-06, + "loss": 0.4634, + "num_input_tokens_seen": 4397784, + "step": 6750 + }, + { + "epoch": 3.9829009433962264, + "grad_norm": 1.8301317691802979, + "learning_rate": 9.703720224960909e-06, + "loss": 0.56, + "num_input_tokens_seen": 4400984, + "step": 6755 + }, + { + "epoch": 3.9858490566037736, + "grad_norm": 1.7535650730133057, + "learning_rate": 9.702847150097552e-06, + "loss": 0.4941, + "num_input_tokens_seen": 4404760, + "step": 6760 + }, + { + "epoch": 3.9887971698113205, + "grad_norm": 2.4453303813934326, + "learning_rate": 9.701972830135143e-06, + "loss": 0.5114, + "num_input_tokens_seen": 4407928, + "step": 6765 + }, + { + "epoch": 3.9917452830188678, + "grad_norm": 1.5601730346679688, + "learning_rate": 9.701097265305164e-06, + "loss": 0.4297, + "num_input_tokens_seen": 4410552, + "step": 6770 + }, + { + "epoch": 3.994693396226415, + "grad_norm": 1.6432669162750244, + "learning_rate": 9.700220455839422e-06, + "loss": 0.4422, + "num_input_tokens_seen": 4414200, + "step": 6775 + }, + { + "epoch": 3.9976415094339623, + "grad_norm": 2.0757100582122803, + "learning_rate": 9.69934240197006e-06, + "loss": 0.4471, + "num_input_tokens_seen": 4417528, + "step": 6780 + }, + { + "epoch": 4.0, + "eval_loss": 0.5017365217208862, + "eval_runtime": 19.0297, + "eval_samples_per_second": 89.124, + "eval_steps_per_second": 22.281, + "num_input_tokens_seen": 4419816, + "step": 6784 + }, + { + "epoch": 4.00058962264151, + "grad_norm": 1.3792768716812134, + "learning_rate": 9.698463103929542e-06, + "loss": 0.4823, + "num_input_tokens_seen": 4420648, + "step": 6785 + }, + { + "epoch": 4.003537735849057, + "grad_norm": 1.2368861436843872, + "learning_rate": 9.697582561950669e-06, + "loss": 0.5052, + "num_input_tokens_seen": 4424936, + "step": 6790 + }, + { + "epoch": 4.006485849056604, + "grad_norm": 1.7510918378829956, + "learning_rate": 9.696700776266568e-06, + "loss": 0.3518, + "num_input_tokens_seen": 4428552, + "step": 6795 + }, + { + "epoch": 4.009433962264151, + "grad_norm": 2.8072891235351562, + "learning_rate": 9.695817747110694e-06, + "loss": 0.5587, + "num_input_tokens_seen": 4432008, + "step": 6800 + }, + { + "epoch": 4.012382075471698, + "grad_norm": 1.63905668258667, + "learning_rate": 9.694933474716831e-06, + "loss": 0.5803, + "num_input_tokens_seen": 4434664, + "step": 6805 + }, + { + "epoch": 4.015330188679245, + "grad_norm": 1.6267446279525757, + "learning_rate": 9.6940479593191e-06, + "loss": 0.5447, + "num_input_tokens_seen": 4437384, + "step": 6810 + }, + { + "epoch": 4.0182783018867925, + "grad_norm": 1.9597493410110474, + "learning_rate": 9.693161201151942e-06, + "loss": 0.4246, + "num_input_tokens_seen": 4439816, + "step": 6815 + }, + { + "epoch": 4.02122641509434, + "grad_norm": 1.4212974309921265, + "learning_rate": 9.692273200450128e-06, + "loss": 0.5792, + "num_input_tokens_seen": 4444392, + "step": 6820 + }, + { + "epoch": 4.024174528301887, + "grad_norm": 1.8943935632705688, + "learning_rate": 9.69138395744876e-06, + "loss": 0.4606, + "num_input_tokens_seen": 4447784, + "step": 6825 + }, + { + "epoch": 4.027122641509434, + "grad_norm": 2.747095823287964, + "learning_rate": 9.690493472383274e-06, + "loss": 0.454, + "num_input_tokens_seen": 4450728, + "step": 6830 + }, + { + "epoch": 4.030070754716981, + "grad_norm": 1.6090871095657349, + "learning_rate": 9.689601745489423e-06, + "loss": 0.4411, + "num_input_tokens_seen": 4454696, + "step": 6835 + }, + { + "epoch": 4.033018867924528, + "grad_norm": 1.3398933410644531, + "learning_rate": 9.6887087770033e-06, + "loss": 0.3798, + "num_input_tokens_seen": 4458472, + "step": 6840 + }, + { + "epoch": 4.035966981132075, + "grad_norm": 1.32926607131958, + "learning_rate": 9.687814567161322e-06, + "loss": 0.3346, + "num_input_tokens_seen": 4461320, + "step": 6845 + }, + { + "epoch": 4.038915094339623, + "grad_norm": 1.4502230882644653, + "learning_rate": 9.686919116200232e-06, + "loss": 0.429, + "num_input_tokens_seen": 4464488, + "step": 6850 + }, + { + "epoch": 4.04186320754717, + "grad_norm": 1.5732759237289429, + "learning_rate": 9.686022424357108e-06, + "loss": 0.3172, + "num_input_tokens_seen": 4468296, + "step": 6855 + }, + { + "epoch": 4.044811320754717, + "grad_norm": 1.155003547668457, + "learning_rate": 9.685124491869353e-06, + "loss": 0.4137, + "num_input_tokens_seen": 4471208, + "step": 6860 + }, + { + "epoch": 4.0477594339622645, + "grad_norm": 2.535128355026245, + "learning_rate": 9.684225318974696e-06, + "loss": 0.5474, + "num_input_tokens_seen": 4473992, + "step": 6865 + }, + { + "epoch": 4.050707547169812, + "grad_norm": 1.5267579555511475, + "learning_rate": 9.683324905911197e-06, + "loss": 0.4243, + "num_input_tokens_seen": 4477288, + "step": 6870 + }, + { + "epoch": 4.053655660377358, + "grad_norm": 1.2087308168411255, + "learning_rate": 9.682423252917245e-06, + "loss": 0.4147, + "num_input_tokens_seen": 4480840, + "step": 6875 + }, + { + "epoch": 4.056603773584905, + "grad_norm": 1.7894783020019531, + "learning_rate": 9.681520360231557e-06, + "loss": 0.3714, + "num_input_tokens_seen": 4484360, + "step": 6880 + }, + { + "epoch": 4.059551886792453, + "grad_norm": 1.3800687789916992, + "learning_rate": 9.680616228093178e-06, + "loss": 0.4868, + "num_input_tokens_seen": 4487368, + "step": 6885 + }, + { + "epoch": 4.0625, + "grad_norm": 1.6571564674377441, + "learning_rate": 9.67971085674148e-06, + "loss": 0.5315, + "num_input_tokens_seen": 4491016, + "step": 6890 + }, + { + "epoch": 4.065448113207547, + "grad_norm": 1.4735475778579712, + "learning_rate": 9.678804246416164e-06, + "loss": 0.4361, + "num_input_tokens_seen": 4494824, + "step": 6895 + }, + { + "epoch": 4.068396226415095, + "grad_norm": 2.686755418777466, + "learning_rate": 9.677896397357259e-06, + "loss": 0.4544, + "num_input_tokens_seen": 4497704, + "step": 6900 + }, + { + "epoch": 4.071344339622642, + "grad_norm": 1.6068991422653198, + "learning_rate": 9.676987309805121e-06, + "loss": 0.4551, + "num_input_tokens_seen": 4500808, + "step": 6905 + }, + { + "epoch": 4.074292452830188, + "grad_norm": 2.4640142917633057, + "learning_rate": 9.67607698400044e-06, + "loss": 0.5567, + "num_input_tokens_seen": 4504520, + "step": 6910 + }, + { + "epoch": 4.0772405660377355, + "grad_norm": 1.974229335784912, + "learning_rate": 9.67516542018422e-06, + "loss": 0.4292, + "num_input_tokens_seen": 4507400, + "step": 6915 + }, + { + "epoch": 4.080188679245283, + "grad_norm": 1.3728193044662476, + "learning_rate": 9.67425261859781e-06, + "loss": 0.4802, + "num_input_tokens_seen": 4510536, + "step": 6920 + }, + { + "epoch": 4.08313679245283, + "grad_norm": 2.1037697792053223, + "learning_rate": 9.673338579482871e-06, + "loss": 0.4879, + "num_input_tokens_seen": 4513608, + "step": 6925 + }, + { + "epoch": 4.086084905660377, + "grad_norm": 2.734231948852539, + "learning_rate": 9.672423303081404e-06, + "loss": 0.4857, + "num_input_tokens_seen": 4516744, + "step": 6930 + }, + { + "epoch": 4.089033018867925, + "grad_norm": 1.0829623937606812, + "learning_rate": 9.67150678963573e-06, + "loss": 0.4397, + "num_input_tokens_seen": 4519976, + "step": 6935 + }, + { + "epoch": 4.091981132075472, + "grad_norm": 1.3850733041763306, + "learning_rate": 9.670589039388501e-06, + "loss": 0.3999, + "num_input_tokens_seen": 4523016, + "step": 6940 + }, + { + "epoch": 4.094929245283019, + "grad_norm": 1.633005976676941, + "learning_rate": 9.669670052582695e-06, + "loss": 0.342, + "num_input_tokens_seen": 4526728, + "step": 6945 + }, + { + "epoch": 4.097877358490566, + "grad_norm": 1.788474678993225, + "learning_rate": 9.668749829461617e-06, + "loss": 0.492, + "num_input_tokens_seen": 4531688, + "step": 6950 + }, + { + "epoch": 4.100825471698113, + "grad_norm": 2.1207964420318604, + "learning_rate": 9.667828370268898e-06, + "loss": 0.41, + "num_input_tokens_seen": 4534632, + "step": 6955 + }, + { + "epoch": 4.10377358490566, + "grad_norm": 1.9003021717071533, + "learning_rate": 9.666905675248505e-06, + "loss": 0.3829, + "num_input_tokens_seen": 4538376, + "step": 6960 + }, + { + "epoch": 4.1067216981132075, + "grad_norm": 1.2470641136169434, + "learning_rate": 9.66598174464472e-06, + "loss": 0.4228, + "num_input_tokens_seen": 4542152, + "step": 6965 + }, + { + "epoch": 4.109669811320755, + "grad_norm": 1.5783416032791138, + "learning_rate": 9.665056578702157e-06, + "loss": 0.4163, + "num_input_tokens_seen": 4546120, + "step": 6970 + }, + { + "epoch": 4.112617924528302, + "grad_norm": 2.2033491134643555, + "learning_rate": 9.66413017766576e-06, + "loss": 0.6089, + "num_input_tokens_seen": 4550504, + "step": 6975 + }, + { + "epoch": 4.115566037735849, + "grad_norm": 2.0115277767181396, + "learning_rate": 9.663202541780799e-06, + "loss": 0.3702, + "num_input_tokens_seen": 4553384, + "step": 6980 + }, + { + "epoch": 4.118514150943396, + "grad_norm": 2.5282280445098877, + "learning_rate": 9.662273671292866e-06, + "loss": 0.3795, + "num_input_tokens_seen": 4556456, + "step": 6985 + }, + { + "epoch": 4.121462264150943, + "grad_norm": 1.7331839799880981, + "learning_rate": 9.661343566447886e-06, + "loss": 0.5846, + "num_input_tokens_seen": 4559112, + "step": 6990 + }, + { + "epoch": 4.12441037735849, + "grad_norm": 1.9574525356292725, + "learning_rate": 9.660412227492107e-06, + "loss": 0.6382, + "num_input_tokens_seen": 4563112, + "step": 6995 + }, + { + "epoch": 4.127358490566038, + "grad_norm": 2.7696988582611084, + "learning_rate": 9.659479654672106e-06, + "loss": 0.4016, + "num_input_tokens_seen": 4566120, + "step": 7000 + }, + { + "epoch": 4.130306603773585, + "grad_norm": 1.5588330030441284, + "learning_rate": 9.658545848234784e-06, + "loss": 0.4114, + "num_input_tokens_seen": 4568936, + "step": 7005 + }, + { + "epoch": 4.133254716981132, + "grad_norm": 2.5199122428894043, + "learning_rate": 9.657610808427372e-06, + "loss": 0.7058, + "num_input_tokens_seen": 4571944, + "step": 7010 + }, + { + "epoch": 4.1362028301886795, + "grad_norm": 2.1615517139434814, + "learning_rate": 9.656674535497425e-06, + "loss": 0.4962, + "num_input_tokens_seen": 4575624, + "step": 7015 + }, + { + "epoch": 4.139150943396227, + "grad_norm": 2.2504501342773438, + "learning_rate": 9.655737029692827e-06, + "loss": 0.5222, + "num_input_tokens_seen": 4578280, + "step": 7020 + }, + { + "epoch": 4.142099056603773, + "grad_norm": 1.3587526082992554, + "learning_rate": 9.654798291261785e-06, + "loss": 0.4279, + "num_input_tokens_seen": 4582024, + "step": 7025 + }, + { + "epoch": 4.1450471698113205, + "grad_norm": 2.7430689334869385, + "learning_rate": 9.653858320452833e-06, + "loss": 0.4952, + "num_input_tokens_seen": 4586344, + "step": 7030 + }, + { + "epoch": 4.147995283018868, + "grad_norm": 1.940010905265808, + "learning_rate": 9.652917117514836e-06, + "loss": 0.3986, + "num_input_tokens_seen": 4589576, + "step": 7035 + }, + { + "epoch": 4.150943396226415, + "grad_norm": 1.2919933795928955, + "learning_rate": 9.651974682696975e-06, + "loss": 0.3703, + "num_input_tokens_seen": 4593736, + "step": 7040 + }, + { + "epoch": 4.153891509433962, + "grad_norm": 4.005589008331299, + "learning_rate": 9.651031016248773e-06, + "loss": 0.5401, + "num_input_tokens_seen": 4596072, + "step": 7045 + }, + { + "epoch": 4.15683962264151, + "grad_norm": 1.7463839054107666, + "learning_rate": 9.65008611842006e-06, + "loss": 0.3957, + "num_input_tokens_seen": 4599528, + "step": 7050 + }, + { + "epoch": 4.159787735849057, + "grad_norm": 2.3930132389068604, + "learning_rate": 9.64913998946101e-06, + "loss": 0.5267, + "num_input_tokens_seen": 4601736, + "step": 7055 + }, + { + "epoch": 4.162735849056604, + "grad_norm": 2.5286080837249756, + "learning_rate": 9.648192629622109e-06, + "loss": 0.4864, + "num_input_tokens_seen": 4606088, + "step": 7060 + }, + { + "epoch": 4.165683962264151, + "grad_norm": 2.717069625854492, + "learning_rate": 9.647244039154178e-06, + "loss": 0.452, + "num_input_tokens_seen": 4608712, + "step": 7065 + }, + { + "epoch": 4.168632075471698, + "grad_norm": 1.8588204383850098, + "learning_rate": 9.64629421830836e-06, + "loss": 0.5189, + "num_input_tokens_seen": 4611752, + "step": 7070 + }, + { + "epoch": 4.171580188679245, + "grad_norm": 1.5143080949783325, + "learning_rate": 9.64534316733612e-06, + "loss": 0.3933, + "num_input_tokens_seen": 4614184, + "step": 7075 + }, + { + "epoch": 4.1745283018867925, + "grad_norm": 1.2861015796661377, + "learning_rate": 9.644390886489258e-06, + "loss": 0.5269, + "num_input_tokens_seen": 4618856, + "step": 7080 + }, + { + "epoch": 4.17747641509434, + "grad_norm": 3.0115578174591064, + "learning_rate": 9.643437376019893e-06, + "loss": 0.5745, + "num_input_tokens_seen": 4622568, + "step": 7085 + }, + { + "epoch": 4.180424528301887, + "grad_norm": 5.937365531921387, + "learning_rate": 9.64248263618047e-06, + "loss": 0.4525, + "num_input_tokens_seen": 4626856, + "step": 7090 + }, + { + "epoch": 4.183372641509434, + "grad_norm": 2.178421974182129, + "learning_rate": 9.64152666722376e-06, + "loss": 0.4558, + "num_input_tokens_seen": 4631048, + "step": 7095 + }, + { + "epoch": 4.186320754716981, + "grad_norm": 1.7571662664413452, + "learning_rate": 9.640569469402863e-06, + "loss": 0.4706, + "num_input_tokens_seen": 4634088, + "step": 7100 + }, + { + "epoch": 4.189268867924528, + "grad_norm": 1.0506714582443237, + "learning_rate": 9.639611042971198e-06, + "loss": 0.4729, + "num_input_tokens_seen": 4638056, + "step": 7105 + }, + { + "epoch": 4.192216981132075, + "grad_norm": 4.74109411239624, + "learning_rate": 9.638651388182514e-06, + "loss": 0.5465, + "num_input_tokens_seen": 4640744, + "step": 7110 + }, + { + "epoch": 4.195165094339623, + "grad_norm": 2.675098419189453, + "learning_rate": 9.637690505290884e-06, + "loss": 0.5278, + "num_input_tokens_seen": 4643432, + "step": 7115 + }, + { + "epoch": 4.19811320754717, + "grad_norm": 3.565408229827881, + "learning_rate": 9.636728394550705e-06, + "loss": 0.473, + "num_input_tokens_seen": 4646184, + "step": 7120 + }, + { + "epoch": 4.201061320754717, + "grad_norm": 3.1837704181671143, + "learning_rate": 9.6357650562167e-06, + "loss": 0.4971, + "num_input_tokens_seen": 4649064, + "step": 7125 + }, + { + "epoch": 4.2040094339622645, + "grad_norm": 3.152163028717041, + "learning_rate": 9.634800490543918e-06, + "loss": 0.4081, + "num_input_tokens_seen": 4652360, + "step": 7130 + }, + { + "epoch": 4.206957547169812, + "grad_norm": 2.869594097137451, + "learning_rate": 9.633834697787731e-06, + "loss": 0.4657, + "num_input_tokens_seen": 4655080, + "step": 7135 + }, + { + "epoch": 4.209905660377358, + "grad_norm": 2.0781044960021973, + "learning_rate": 9.632867678203836e-06, + "loss": 0.519, + "num_input_tokens_seen": 4657384, + "step": 7140 + }, + { + "epoch": 4.212853773584905, + "grad_norm": 1.5709443092346191, + "learning_rate": 9.631899432048258e-06, + "loss": 0.3374, + "num_input_tokens_seen": 4660296, + "step": 7145 + }, + { + "epoch": 4.215801886792453, + "grad_norm": 1.8899904489517212, + "learning_rate": 9.630929959577343e-06, + "loss": 0.5037, + "num_input_tokens_seen": 4664360, + "step": 7150 + }, + { + "epoch": 4.21875, + "grad_norm": 1.7032573223114014, + "learning_rate": 9.629959261047764e-06, + "loss": 0.448, + "num_input_tokens_seen": 4668360, + "step": 7155 + }, + { + "epoch": 4.221698113207547, + "grad_norm": 1.3508163690567017, + "learning_rate": 9.628987336716513e-06, + "loss": 0.5411, + "num_input_tokens_seen": 4671528, + "step": 7160 + }, + { + "epoch": 4.224646226415095, + "grad_norm": 4.669062614440918, + "learning_rate": 9.628014186840918e-06, + "loss": 0.4886, + "num_input_tokens_seen": 4675016, + "step": 7165 + }, + { + "epoch": 4.227594339622642, + "grad_norm": 1.9774128198623657, + "learning_rate": 9.62703981167862e-06, + "loss": 0.4901, + "num_input_tokens_seen": 4677768, + "step": 7170 + }, + { + "epoch": 4.230542452830188, + "grad_norm": 1.5904439687728882, + "learning_rate": 9.626064211487592e-06, + "loss": 0.4378, + "num_input_tokens_seen": 4681064, + "step": 7175 + }, + { + "epoch": 4.2334905660377355, + "grad_norm": 2.5728485584259033, + "learning_rate": 9.625087386526125e-06, + "loss": 0.5005, + "num_input_tokens_seen": 4683784, + "step": 7180 + }, + { + "epoch": 4.236438679245283, + "grad_norm": 1.2340127229690552, + "learning_rate": 9.624109337052839e-06, + "loss": 0.4803, + "num_input_tokens_seen": 4687080, + "step": 7185 + }, + { + "epoch": 4.23938679245283, + "grad_norm": 2.733752489089966, + "learning_rate": 9.623130063326678e-06, + "loss": 0.4852, + "num_input_tokens_seen": 4689544, + "step": 7190 + }, + { + "epoch": 4.242334905660377, + "grad_norm": 1.2143913507461548, + "learning_rate": 9.622149565606909e-06, + "loss": 0.4812, + "num_input_tokens_seen": 4693640, + "step": 7195 + }, + { + "epoch": 4.245283018867925, + "grad_norm": 1.6723196506500244, + "learning_rate": 9.621167844153122e-06, + "loss": 0.4102, + "num_input_tokens_seen": 4696616, + "step": 7200 + }, + { + "epoch": 4.248231132075472, + "grad_norm": 2.3170886039733887, + "learning_rate": 9.620184899225231e-06, + "loss": 0.557, + "num_input_tokens_seen": 4700904, + "step": 7205 + }, + { + "epoch": 4.251179245283019, + "grad_norm": 1.6726902723312378, + "learning_rate": 9.619200731083477e-06, + "loss": 0.507, + "num_input_tokens_seen": 4703912, + "step": 7210 + }, + { + "epoch": 4.254127358490566, + "grad_norm": 0.8947533369064331, + "learning_rate": 9.618215339988422e-06, + "loss": 0.6216, + "num_input_tokens_seen": 4707048, + "step": 7215 + }, + { + "epoch": 4.257075471698113, + "grad_norm": 2.406144857406616, + "learning_rate": 9.617228726200951e-06, + "loss": 0.4979, + "num_input_tokens_seen": 4710312, + "step": 7220 + }, + { + "epoch": 4.26002358490566, + "grad_norm": 2.2040109634399414, + "learning_rate": 9.616240889982277e-06, + "loss": 0.384, + "num_input_tokens_seen": 4713384, + "step": 7225 + }, + { + "epoch": 4.2629716981132075, + "grad_norm": 2.256894588470459, + "learning_rate": 9.61525183159393e-06, + "loss": 0.4548, + "num_input_tokens_seen": 4716168, + "step": 7230 + }, + { + "epoch": 4.265919811320755, + "grad_norm": 2.1924822330474854, + "learning_rate": 9.614261551297774e-06, + "loss": 0.3812, + "num_input_tokens_seen": 4719368, + "step": 7235 + }, + { + "epoch": 4.268867924528302, + "grad_norm": 2.008199453353882, + "learning_rate": 9.613270049355983e-06, + "loss": 0.4746, + "num_input_tokens_seen": 4722536, + "step": 7240 + }, + { + "epoch": 4.271816037735849, + "grad_norm": 2.0688059329986572, + "learning_rate": 9.612277326031065e-06, + "loss": 0.4638, + "num_input_tokens_seen": 4726056, + "step": 7245 + }, + { + "epoch": 4.274764150943396, + "grad_norm": 2.481250762939453, + "learning_rate": 9.611283381585848e-06, + "loss": 0.4333, + "num_input_tokens_seen": 4728712, + "step": 7250 + }, + { + "epoch": 4.277712264150943, + "grad_norm": 1.925898790359497, + "learning_rate": 9.61028821628348e-06, + "loss": 0.4202, + "num_input_tokens_seen": 4732328, + "step": 7255 + }, + { + "epoch": 4.28066037735849, + "grad_norm": 2.0609130859375, + "learning_rate": 9.609291830387439e-06, + "loss": 0.7485, + "num_input_tokens_seen": 4735240, + "step": 7260 + }, + { + "epoch": 4.283608490566038, + "grad_norm": 2.0005359649658203, + "learning_rate": 9.608294224161523e-06, + "loss": 0.3855, + "num_input_tokens_seen": 4739112, + "step": 7265 + }, + { + "epoch": 4.286556603773585, + "grad_norm": 1.4678510427474976, + "learning_rate": 9.607295397869847e-06, + "loss": 0.5807, + "num_input_tokens_seen": 4741640, + "step": 7270 + }, + { + "epoch": 4.289504716981132, + "grad_norm": 3.272409200668335, + "learning_rate": 9.60629535177686e-06, + "loss": 0.5031, + "num_input_tokens_seen": 4744104, + "step": 7275 + }, + { + "epoch": 4.2924528301886795, + "grad_norm": 1.7416832447052002, + "learning_rate": 9.605294086147325e-06, + "loss": 0.6573, + "num_input_tokens_seen": 4747880, + "step": 7280 + }, + { + "epoch": 4.295400943396227, + "grad_norm": 2.107044219970703, + "learning_rate": 9.604291601246333e-06, + "loss": 0.5157, + "num_input_tokens_seen": 4750728, + "step": 7285 + }, + { + "epoch": 4.298349056603773, + "grad_norm": 1.8313056230545044, + "learning_rate": 9.603287897339299e-06, + "loss": 0.3689, + "num_input_tokens_seen": 4754088, + "step": 7290 + }, + { + "epoch": 4.3012971698113205, + "grad_norm": 1.8223328590393066, + "learning_rate": 9.602282974691953e-06, + "loss": 0.3235, + "num_input_tokens_seen": 4757256, + "step": 7295 + }, + { + "epoch": 4.304245283018868, + "grad_norm": 2.2340760231018066, + "learning_rate": 9.601276833570355e-06, + "loss": 0.5108, + "num_input_tokens_seen": 4760264, + "step": 7300 + }, + { + "epoch": 4.307193396226415, + "grad_norm": 2.3296284675598145, + "learning_rate": 9.600269474240885e-06, + "loss": 0.3417, + "num_input_tokens_seen": 4762440, + "step": 7305 + }, + { + "epoch": 4.310141509433962, + "grad_norm": 2.4569525718688965, + "learning_rate": 9.599260896970246e-06, + "loss": 0.6009, + "num_input_tokens_seen": 4766600, + "step": 7310 + }, + { + "epoch": 4.31308962264151, + "grad_norm": 3.064113140106201, + "learning_rate": 9.598251102025463e-06, + "loss": 0.3808, + "num_input_tokens_seen": 4769320, + "step": 7315 + }, + { + "epoch": 4.316037735849057, + "grad_norm": 2.808861494064331, + "learning_rate": 9.597240089673882e-06, + "loss": 0.5002, + "num_input_tokens_seen": 4772136, + "step": 7320 + }, + { + "epoch": 4.318985849056604, + "grad_norm": 3.154987096786499, + "learning_rate": 9.596227860183175e-06, + "loss": 0.4178, + "num_input_tokens_seen": 4774920, + "step": 7325 + }, + { + "epoch": 4.321933962264151, + "grad_norm": 2.1308095455169678, + "learning_rate": 9.595214413821334e-06, + "loss": 0.3371, + "num_input_tokens_seen": 4778632, + "step": 7330 + }, + { + "epoch": 4.324882075471698, + "grad_norm": 2.2511134147644043, + "learning_rate": 9.59419975085667e-06, + "loss": 0.5201, + "num_input_tokens_seen": 4782248, + "step": 7335 + }, + { + "epoch": 4.327830188679245, + "grad_norm": 2.4492440223693848, + "learning_rate": 9.593183871557826e-06, + "loss": 0.4916, + "num_input_tokens_seen": 4785256, + "step": 7340 + }, + { + "epoch": 4.3307783018867925, + "grad_norm": 1.8863738775253296, + "learning_rate": 9.592166776193754e-06, + "loss": 0.3618, + "num_input_tokens_seen": 4788360, + "step": 7345 + }, + { + "epoch": 4.33372641509434, + "grad_norm": 2.3676235675811768, + "learning_rate": 9.591148465033738e-06, + "loss": 0.4221, + "num_input_tokens_seen": 4791848, + "step": 7350 + }, + { + "epoch": 4.336674528301887, + "grad_norm": 2.252833127975464, + "learning_rate": 9.590128938347378e-06, + "loss": 0.4216, + "num_input_tokens_seen": 4794760, + "step": 7355 + }, + { + "epoch": 4.339622641509434, + "grad_norm": 1.0753871202468872, + "learning_rate": 9.589108196404599e-06, + "loss": 0.4328, + "num_input_tokens_seen": 4797608, + "step": 7360 + }, + { + "epoch": 4.342570754716981, + "grad_norm": 2.2163984775543213, + "learning_rate": 9.588086239475649e-06, + "loss": 0.4152, + "num_input_tokens_seen": 4801224, + "step": 7365 + }, + { + "epoch": 4.345518867924528, + "grad_norm": 2.5468742847442627, + "learning_rate": 9.587063067831092e-06, + "loss": 0.4721, + "num_input_tokens_seen": 4804488, + "step": 7370 + }, + { + "epoch": 4.348466981132075, + "grad_norm": 1.7193576097488403, + "learning_rate": 9.586038681741818e-06, + "loss": 0.4086, + "num_input_tokens_seen": 4806984, + "step": 7375 + }, + { + "epoch": 4.351415094339623, + "grad_norm": 1.8865504264831543, + "learning_rate": 9.58501308147904e-06, + "loss": 0.6139, + "num_input_tokens_seen": 4810280, + "step": 7380 + }, + { + "epoch": 4.35436320754717, + "grad_norm": 1.2079781293869019, + "learning_rate": 9.583986267314288e-06, + "loss": 0.4688, + "num_input_tokens_seen": 4813320, + "step": 7385 + }, + { + "epoch": 4.357311320754717, + "grad_norm": 1.7007620334625244, + "learning_rate": 9.582958239519416e-06, + "loss": 0.5089, + "num_input_tokens_seen": 4816296, + "step": 7390 + }, + { + "epoch": 4.3602594339622645, + "grad_norm": 2.441124200820923, + "learning_rate": 9.581928998366597e-06, + "loss": 0.5664, + "num_input_tokens_seen": 4819176, + "step": 7395 + }, + { + "epoch": 4.363207547169811, + "grad_norm": 1.5746839046478271, + "learning_rate": 9.58089854412833e-06, + "loss": 0.3879, + "num_input_tokens_seen": 4822632, + "step": 7400 + }, + { + "epoch": 4.366155660377358, + "grad_norm": 2.8169047832489014, + "learning_rate": 9.579866877077431e-06, + "loss": 0.475, + "num_input_tokens_seen": 4825480, + "step": 7405 + }, + { + "epoch": 4.369103773584905, + "grad_norm": 1.3569514751434326, + "learning_rate": 9.578833997487038e-06, + "loss": 0.4442, + "num_input_tokens_seen": 4828712, + "step": 7410 + }, + { + "epoch": 4.372051886792453, + "grad_norm": 2.8722033500671387, + "learning_rate": 9.57779990563061e-06, + "loss": 0.418, + "num_input_tokens_seen": 4831944, + "step": 7415 + }, + { + "epoch": 4.375, + "grad_norm": 2.1287319660186768, + "learning_rate": 9.576764601781928e-06, + "loss": 0.413, + "num_input_tokens_seen": 4835528, + "step": 7420 + }, + { + "epoch": 4.377948113207547, + "grad_norm": 2.891575574874878, + "learning_rate": 9.575728086215093e-06, + "loss": 0.4642, + "num_input_tokens_seen": 4838152, + "step": 7425 + }, + { + "epoch": 4.380896226415095, + "grad_norm": 6.918198585510254, + "learning_rate": 9.574690359204527e-06, + "loss": 0.5152, + "num_input_tokens_seen": 4840520, + "step": 7430 + }, + { + "epoch": 4.383844339622642, + "grad_norm": 1.0360878705978394, + "learning_rate": 9.573651421024972e-06, + "loss": 0.3794, + "num_input_tokens_seen": 4845064, + "step": 7435 + }, + { + "epoch": 4.386792452830189, + "grad_norm": 3.0622286796569824, + "learning_rate": 9.572611271951494e-06, + "loss": 0.5777, + "num_input_tokens_seen": 4847464, + "step": 7440 + }, + { + "epoch": 4.3897405660377355, + "grad_norm": 2.212538957595825, + "learning_rate": 9.571569912259473e-06, + "loss": 0.5316, + "num_input_tokens_seen": 4850728, + "step": 7445 + }, + { + "epoch": 4.392688679245283, + "grad_norm": 1.8893916606903076, + "learning_rate": 9.570527342224614e-06, + "loss": 0.4438, + "num_input_tokens_seen": 4854440, + "step": 7450 + }, + { + "epoch": 4.39563679245283, + "grad_norm": 3.0040769577026367, + "learning_rate": 9.569483562122945e-06, + "loss": 0.499, + "num_input_tokens_seen": 4857960, + "step": 7455 + }, + { + "epoch": 4.398584905660377, + "grad_norm": 1.3081634044647217, + "learning_rate": 9.568438572230811e-06, + "loss": 0.3592, + "num_input_tokens_seen": 4860840, + "step": 7460 + }, + { + "epoch": 4.401533018867925, + "grad_norm": 1.9627350568771362, + "learning_rate": 9.567392372824873e-06, + "loss": 0.4627, + "num_input_tokens_seen": 4863240, + "step": 7465 + }, + { + "epoch": 4.404481132075472, + "grad_norm": 2.4167428016662598, + "learning_rate": 9.566344964182123e-06, + "loss": 0.4864, + "num_input_tokens_seen": 4866248, + "step": 7470 + }, + { + "epoch": 4.407429245283019, + "grad_norm": 3.1545193195343018, + "learning_rate": 9.565296346579862e-06, + "loss": 0.3894, + "num_input_tokens_seen": 4869256, + "step": 7475 + }, + { + "epoch": 4.410377358490566, + "grad_norm": 4.5815815925598145, + "learning_rate": 9.564246520295719e-06, + "loss": 0.5238, + "num_input_tokens_seen": 4871912, + "step": 7480 + }, + { + "epoch": 4.413325471698113, + "grad_norm": 1.6332738399505615, + "learning_rate": 9.563195485607638e-06, + "loss": 0.4309, + "num_input_tokens_seen": 4875944, + "step": 7485 + }, + { + "epoch": 4.41627358490566, + "grad_norm": 1.6208539009094238, + "learning_rate": 9.562143242793885e-06, + "loss": 0.4546, + "num_input_tokens_seen": 4878280, + "step": 7490 + }, + { + "epoch": 4.4192216981132075, + "grad_norm": 2.425055503845215, + "learning_rate": 9.561089792133048e-06, + "loss": 0.4125, + "num_input_tokens_seen": 4881352, + "step": 7495 + }, + { + "epoch": 4.422169811320755, + "grad_norm": 1.652378797531128, + "learning_rate": 9.560035133904031e-06, + "loss": 0.499, + "num_input_tokens_seen": 4884808, + "step": 7500 + }, + { + "epoch": 4.425117924528302, + "grad_norm": 1.2958364486694336, + "learning_rate": 9.55897926838606e-06, + "loss": 0.4404, + "num_input_tokens_seen": 4888040, + "step": 7505 + }, + { + "epoch": 4.428066037735849, + "grad_norm": 2.2129578590393066, + "learning_rate": 9.55792219585868e-06, + "loss": 0.4059, + "num_input_tokens_seen": 4891144, + "step": 7510 + }, + { + "epoch": 4.431014150943396, + "grad_norm": 2.1508052349090576, + "learning_rate": 9.556863916601754e-06, + "loss": 0.4684, + "num_input_tokens_seen": 4894344, + "step": 7515 + }, + { + "epoch": 4.433962264150943, + "grad_norm": 1.6770113706588745, + "learning_rate": 9.555804430895467e-06, + "loss": 0.5369, + "num_input_tokens_seen": 4897352, + "step": 7520 + }, + { + "epoch": 4.43691037735849, + "grad_norm": 1.7450464963912964, + "learning_rate": 9.554743739020325e-06, + "loss": 0.3056, + "num_input_tokens_seen": 4900040, + "step": 7525 + }, + { + "epoch": 4.439858490566038, + "grad_norm": 1.5154356956481934, + "learning_rate": 9.553681841257146e-06, + "loss": 0.5413, + "num_input_tokens_seen": 4903560, + "step": 7530 + }, + { + "epoch": 4.442806603773585, + "grad_norm": 2.1468966007232666, + "learning_rate": 9.552618737887073e-06, + "loss": 0.4726, + "num_input_tokens_seen": 4907496, + "step": 7535 + }, + { + "epoch": 4.445754716981132, + "grad_norm": 2.379988193511963, + "learning_rate": 9.55155442919157e-06, + "loss": 0.5243, + "num_input_tokens_seen": 4910376, + "step": 7540 + }, + { + "epoch": 4.4487028301886795, + "grad_norm": 2.343595266342163, + "learning_rate": 9.550488915452416e-06, + "loss": 0.4509, + "num_input_tokens_seen": 4913512, + "step": 7545 + }, + { + "epoch": 4.451650943396227, + "grad_norm": 2.2828774452209473, + "learning_rate": 9.54942219695171e-06, + "loss": 0.5485, + "num_input_tokens_seen": 4916264, + "step": 7550 + }, + { + "epoch": 4.454599056603773, + "grad_norm": 1.4305038452148438, + "learning_rate": 9.54835427397187e-06, + "loss": 0.4777, + "num_input_tokens_seen": 4920200, + "step": 7555 + }, + { + "epoch": 4.4575471698113205, + "grad_norm": 2.3910508155822754, + "learning_rate": 9.547285146795634e-06, + "loss": 0.5433, + "num_input_tokens_seen": 4923304, + "step": 7560 + }, + { + "epoch": 4.460495283018868, + "grad_norm": 2.9457247257232666, + "learning_rate": 9.546214815706059e-06, + "loss": 0.5329, + "num_input_tokens_seen": 4926312, + "step": 7565 + }, + { + "epoch": 4.463443396226415, + "grad_norm": 4.562378406524658, + "learning_rate": 9.545143280986518e-06, + "loss": 0.3715, + "num_input_tokens_seen": 4930280, + "step": 7570 + }, + { + "epoch": 4.466391509433962, + "grad_norm": 1.3466272354125977, + "learning_rate": 9.544070542920703e-06, + "loss": 0.5294, + "num_input_tokens_seen": 4934024, + "step": 7575 + }, + { + "epoch": 4.46933962264151, + "grad_norm": 1.6014982461929321, + "learning_rate": 9.542996601792629e-06, + "loss": 0.552, + "num_input_tokens_seen": 4937096, + "step": 7580 + }, + { + "epoch": 4.472287735849057, + "grad_norm": 4.151698589324951, + "learning_rate": 9.541921457886624e-06, + "loss": 0.5132, + "num_input_tokens_seen": 4939848, + "step": 7585 + }, + { + "epoch": 4.475235849056604, + "grad_norm": 2.1281020641326904, + "learning_rate": 9.54084511148734e-06, + "loss": 0.4083, + "num_input_tokens_seen": 4942696, + "step": 7590 + }, + { + "epoch": 4.478183962264151, + "grad_norm": 2.5933918952941895, + "learning_rate": 9.539767562879742e-06, + "loss": 0.4343, + "num_input_tokens_seen": 4945704, + "step": 7595 + }, + { + "epoch": 4.481132075471698, + "grad_norm": 2.99198579788208, + "learning_rate": 9.538688812349118e-06, + "loss": 0.5065, + "num_input_tokens_seen": 4948680, + "step": 7600 + }, + { + "epoch": 4.484080188679245, + "grad_norm": 1.916102647781372, + "learning_rate": 9.537608860181069e-06, + "loss": 0.437, + "num_input_tokens_seen": 4951688, + "step": 7605 + }, + { + "epoch": 4.4870283018867925, + "grad_norm": 1.8582398891448975, + "learning_rate": 9.536527706661519e-06, + "loss": 0.5244, + "num_input_tokens_seen": 4955752, + "step": 7610 + }, + { + "epoch": 4.48997641509434, + "grad_norm": 1.8352452516555786, + "learning_rate": 9.535445352076707e-06, + "loss": 0.3742, + "num_input_tokens_seen": 4958888, + "step": 7615 + }, + { + "epoch": 4.492924528301887, + "grad_norm": 1.9488977193832397, + "learning_rate": 9.534361796713191e-06, + "loss": 0.4895, + "num_input_tokens_seen": 4961992, + "step": 7620 + }, + { + "epoch": 4.495872641509434, + "grad_norm": 2.2810184955596924, + "learning_rate": 9.533277040857847e-06, + "loss": 0.4034, + "num_input_tokens_seen": 4964616, + "step": 7625 + }, + { + "epoch": 4.498820754716981, + "grad_norm": 2.05924129486084, + "learning_rate": 9.53219108479787e-06, + "loss": 0.3751, + "num_input_tokens_seen": 4967912, + "step": 7630 + }, + { + "epoch": 4.501768867924528, + "grad_norm": 2.593048334121704, + "learning_rate": 9.53110392882077e-06, + "loss": 0.2893, + "num_input_tokens_seen": 4970280, + "step": 7635 + }, + { + "epoch": 4.504716981132075, + "grad_norm": 1.8708491325378418, + "learning_rate": 9.530015573214378e-06, + "loss": 0.467, + "num_input_tokens_seen": 4973224, + "step": 7640 + }, + { + "epoch": 4.507665094339623, + "grad_norm": 1.2913099527359009, + "learning_rate": 9.528926018266837e-06, + "loss": 0.5468, + "num_input_tokens_seen": 4976808, + "step": 7645 + }, + { + "epoch": 4.51061320754717, + "grad_norm": 1.45063316822052, + "learning_rate": 9.527835264266617e-06, + "loss": 0.5315, + "num_input_tokens_seen": 4982312, + "step": 7650 + }, + { + "epoch": 4.513561320754717, + "grad_norm": 1.5617191791534424, + "learning_rate": 9.526743311502496e-06, + "loss": 0.4115, + "num_input_tokens_seen": 4984744, + "step": 7655 + }, + { + "epoch": 4.5165094339622645, + "grad_norm": 2.0885818004608154, + "learning_rate": 9.525650160263573e-06, + "loss": 0.4663, + "num_input_tokens_seen": 4987816, + "step": 7660 + }, + { + "epoch": 4.519457547169811, + "grad_norm": 1.497378945350647, + "learning_rate": 9.524555810839267e-06, + "loss": 0.5458, + "num_input_tokens_seen": 4990856, + "step": 7665 + }, + { + "epoch": 4.522405660377358, + "grad_norm": 2.121843099594116, + "learning_rate": 9.523460263519309e-06, + "loss": 0.3345, + "num_input_tokens_seen": 4993384, + "step": 7670 + }, + { + "epoch": 4.525353773584905, + "grad_norm": 1.7966277599334717, + "learning_rate": 9.522363518593753e-06, + "loss": 0.4313, + "num_input_tokens_seen": 4996872, + "step": 7675 + }, + { + "epoch": 4.528301886792453, + "grad_norm": 1.5318876504898071, + "learning_rate": 9.521265576352963e-06, + "loss": 0.4791, + "num_input_tokens_seen": 4999304, + "step": 7680 + }, + { + "epoch": 4.53125, + "grad_norm": 1.2461329698562622, + "learning_rate": 9.520166437087628e-06, + "loss": 0.5386, + "num_input_tokens_seen": 5002760, + "step": 7685 + }, + { + "epoch": 4.534198113207547, + "grad_norm": 1.2866296768188477, + "learning_rate": 9.519066101088748e-06, + "loss": 0.3854, + "num_input_tokens_seen": 5005704, + "step": 7690 + }, + { + "epoch": 4.537146226415095, + "grad_norm": 1.8826137781143188, + "learning_rate": 9.51796456864764e-06, + "loss": 0.3951, + "num_input_tokens_seen": 5008616, + "step": 7695 + }, + { + "epoch": 4.540094339622642, + "grad_norm": 1.1628481149673462, + "learning_rate": 9.516861840055942e-06, + "loss": 0.4036, + "num_input_tokens_seen": 5011624, + "step": 7700 + }, + { + "epoch": 4.543042452830189, + "grad_norm": 2.1325292587280273, + "learning_rate": 9.515757915605604e-06, + "loss": 0.5455, + "num_input_tokens_seen": 5014760, + "step": 7705 + }, + { + "epoch": 4.5459905660377355, + "grad_norm": 2.718200922012329, + "learning_rate": 9.514652795588899e-06, + "loss": 0.4415, + "num_input_tokens_seen": 5018184, + "step": 7710 + }, + { + "epoch": 4.548938679245283, + "grad_norm": 1.7247861623764038, + "learning_rate": 9.513546480298405e-06, + "loss": 0.5423, + "num_input_tokens_seen": 5021448, + "step": 7715 + }, + { + "epoch": 4.55188679245283, + "grad_norm": 1.761030673980713, + "learning_rate": 9.512438970027032e-06, + "loss": 0.4206, + "num_input_tokens_seen": 5024456, + "step": 7720 + }, + { + "epoch": 4.554834905660377, + "grad_norm": 1.4829206466674805, + "learning_rate": 9.511330265067992e-06, + "loss": 0.4736, + "num_input_tokens_seen": 5026792, + "step": 7725 + }, + { + "epoch": 4.557783018867925, + "grad_norm": 1.4282112121582031, + "learning_rate": 9.510220365714822e-06, + "loss": 0.3369, + "num_input_tokens_seen": 5029128, + "step": 7730 + }, + { + "epoch": 4.560731132075472, + "grad_norm": 1.5470753908157349, + "learning_rate": 9.509109272261373e-06, + "loss": 0.3452, + "num_input_tokens_seen": 5033064, + "step": 7735 + }, + { + "epoch": 4.563679245283019, + "grad_norm": 1.2875882387161255, + "learning_rate": 9.50799698500181e-06, + "loss": 0.3933, + "num_input_tokens_seen": 5036392, + "step": 7740 + }, + { + "epoch": 4.566627358490566, + "grad_norm": 2.0441648960113525, + "learning_rate": 9.506883504230618e-06, + "loss": 0.4035, + "num_input_tokens_seen": 5040296, + "step": 7745 + }, + { + "epoch": 4.569575471698113, + "grad_norm": 1.0374459028244019, + "learning_rate": 9.505768830242593e-06, + "loss": 0.4266, + "num_input_tokens_seen": 5043688, + "step": 7750 + }, + { + "epoch": 4.57252358490566, + "grad_norm": 2.1011946201324463, + "learning_rate": 9.504652963332852e-06, + "loss": 0.489, + "num_input_tokens_seen": 5046152, + "step": 7755 + }, + { + "epoch": 4.5754716981132075, + "grad_norm": 2.1412925720214844, + "learning_rate": 9.503535903796825e-06, + "loss": 0.496, + "num_input_tokens_seen": 5048360, + "step": 7760 + }, + { + "epoch": 4.578419811320755, + "grad_norm": 2.4934630393981934, + "learning_rate": 9.50241765193026e-06, + "loss": 0.3689, + "num_input_tokens_seen": 5051112, + "step": 7765 + }, + { + "epoch": 4.581367924528302, + "grad_norm": 5.191808223724365, + "learning_rate": 9.501298208029214e-06, + "loss": 0.4181, + "num_input_tokens_seen": 5053480, + "step": 7770 + }, + { + "epoch": 4.584316037735849, + "grad_norm": 1.1557424068450928, + "learning_rate": 9.500177572390071e-06, + "loss": 0.3971, + "num_input_tokens_seen": 5057000, + "step": 7775 + }, + { + "epoch": 4.587264150943396, + "grad_norm": 2.4938762187957764, + "learning_rate": 9.49905574530952e-06, + "loss": 0.4833, + "num_input_tokens_seen": 5060072, + "step": 7780 + }, + { + "epoch": 4.590212264150943, + "grad_norm": 4.864057540893555, + "learning_rate": 9.497932727084571e-06, + "loss": 0.626, + "num_input_tokens_seen": 5062568, + "step": 7785 + }, + { + "epoch": 4.59316037735849, + "grad_norm": 1.5662068128585815, + "learning_rate": 9.496808518012545e-06, + "loss": 0.4124, + "num_input_tokens_seen": 5066600, + "step": 7790 + }, + { + "epoch": 4.596108490566038, + "grad_norm": 1.232354760169983, + "learning_rate": 9.495683118391087e-06, + "loss": 0.4609, + "num_input_tokens_seen": 5069384, + "step": 7795 + }, + { + "epoch": 4.599056603773585, + "grad_norm": 1.6870357990264893, + "learning_rate": 9.494556528518146e-06, + "loss": 0.4144, + "num_input_tokens_seen": 5072520, + "step": 7800 + }, + { + "epoch": 4.602004716981132, + "grad_norm": 5.364602088928223, + "learning_rate": 9.493428748691995e-06, + "loss": 0.5683, + "num_input_tokens_seen": 5075208, + "step": 7805 + }, + { + "epoch": 4.6049528301886795, + "grad_norm": 1.4015411138534546, + "learning_rate": 9.492299779211215e-06, + "loss": 0.3866, + "num_input_tokens_seen": 5077992, + "step": 7810 + }, + { + "epoch": 4.607900943396227, + "grad_norm": 1.987358570098877, + "learning_rate": 9.49116962037471e-06, + "loss": 0.6076, + "num_input_tokens_seen": 5080456, + "step": 7815 + }, + { + "epoch": 4.610849056603773, + "grad_norm": 2.5312421321868896, + "learning_rate": 9.490038272481691e-06, + "loss": 0.4501, + "num_input_tokens_seen": 5084168, + "step": 7820 + }, + { + "epoch": 4.6137971698113205, + "grad_norm": 4.634452819824219, + "learning_rate": 9.488905735831689e-06, + "loss": 0.492, + "num_input_tokens_seen": 5086824, + "step": 7825 + }, + { + "epoch": 4.616745283018868, + "grad_norm": 3.1330456733703613, + "learning_rate": 9.487772010724548e-06, + "loss": 0.3705, + "num_input_tokens_seen": 5089928, + "step": 7830 + }, + { + "epoch": 4.619693396226415, + "grad_norm": 1.1904127597808838, + "learning_rate": 9.486637097460425e-06, + "loss": 0.3599, + "num_input_tokens_seen": 5093928, + "step": 7835 + }, + { + "epoch": 4.622641509433962, + "grad_norm": 2.6280577182769775, + "learning_rate": 9.485500996339793e-06, + "loss": 0.404, + "num_input_tokens_seen": 5096680, + "step": 7840 + }, + { + "epoch": 4.62558962264151, + "grad_norm": 1.6727854013442993, + "learning_rate": 9.484363707663443e-06, + "loss": 0.4298, + "num_input_tokens_seen": 5099496, + "step": 7845 + }, + { + "epoch": 4.628537735849057, + "grad_norm": 4.714191913604736, + "learning_rate": 9.483225231732474e-06, + "loss": 0.5356, + "num_input_tokens_seen": 5102472, + "step": 7850 + }, + { + "epoch": 4.631485849056604, + "grad_norm": 1.9147136211395264, + "learning_rate": 9.482085568848302e-06, + "loss": 0.4106, + "num_input_tokens_seen": 5105768, + "step": 7855 + }, + { + "epoch": 4.634433962264151, + "grad_norm": 1.8578300476074219, + "learning_rate": 9.480944719312659e-06, + "loss": 0.6216, + "num_input_tokens_seen": 5109224, + "step": 7860 + }, + { + "epoch": 4.637382075471698, + "grad_norm": 3.6490964889526367, + "learning_rate": 9.47980268342759e-06, + "loss": 0.5203, + "num_input_tokens_seen": 5112392, + "step": 7865 + }, + { + "epoch": 4.640330188679245, + "grad_norm": 2.039979934692383, + "learning_rate": 9.478659461495456e-06, + "loss": 0.4738, + "num_input_tokens_seen": 5114376, + "step": 7870 + }, + { + "epoch": 4.6432783018867925, + "grad_norm": 1.3141827583312988, + "learning_rate": 9.477515053818926e-06, + "loss": 0.4424, + "num_input_tokens_seen": 5118536, + "step": 7875 + }, + { + "epoch": 4.64622641509434, + "grad_norm": 3.704115867614746, + "learning_rate": 9.476369460700988e-06, + "loss": 0.4267, + "num_input_tokens_seen": 5121736, + "step": 7880 + }, + { + "epoch": 4.649174528301887, + "grad_norm": 2.4232606887817383, + "learning_rate": 9.475222682444944e-06, + "loss": 0.4979, + "num_input_tokens_seen": 5124520, + "step": 7885 + }, + { + "epoch": 4.652122641509434, + "grad_norm": 1.2228881120681763, + "learning_rate": 9.474074719354406e-06, + "loss": 0.4655, + "num_input_tokens_seen": 5127912, + "step": 7890 + }, + { + "epoch": 4.655070754716981, + "grad_norm": 1.836904764175415, + "learning_rate": 9.472925571733306e-06, + "loss": 0.4555, + "num_input_tokens_seen": 5130824, + "step": 7895 + }, + { + "epoch": 4.658018867924528, + "grad_norm": 1.9640544652938843, + "learning_rate": 9.471775239885883e-06, + "loss": 0.4919, + "num_input_tokens_seen": 5134024, + "step": 7900 + }, + { + "epoch": 4.660966981132075, + "grad_norm": 2.0581021308898926, + "learning_rate": 9.470623724116693e-06, + "loss": 0.4143, + "num_input_tokens_seen": 5137672, + "step": 7905 + }, + { + "epoch": 4.663915094339623, + "grad_norm": 1.1155809164047241, + "learning_rate": 9.469471024730606e-06, + "loss": 0.4364, + "num_input_tokens_seen": 5141480, + "step": 7910 + }, + { + "epoch": 4.66686320754717, + "grad_norm": 1.1886358261108398, + "learning_rate": 9.4683171420328e-06, + "loss": 0.5073, + "num_input_tokens_seen": 5145800, + "step": 7915 + }, + { + "epoch": 4.669811320754717, + "grad_norm": 2.5233659744262695, + "learning_rate": 9.467162076328776e-06, + "loss": 0.4013, + "num_input_tokens_seen": 5149320, + "step": 7920 + }, + { + "epoch": 4.6727594339622645, + "grad_norm": 3.3065855503082275, + "learning_rate": 9.466005827924337e-06, + "loss": 0.5339, + "num_input_tokens_seen": 5152072, + "step": 7925 + }, + { + "epoch": 4.675707547169811, + "grad_norm": 1.9489041566848755, + "learning_rate": 9.46484839712561e-06, + "loss": 0.6446, + "num_input_tokens_seen": 5155304, + "step": 7930 + }, + { + "epoch": 4.678655660377358, + "grad_norm": 4.527299880981445, + "learning_rate": 9.463689784239026e-06, + "loss": 0.4136, + "num_input_tokens_seen": 5158216, + "step": 7935 + }, + { + "epoch": 4.681603773584905, + "grad_norm": 2.0233635902404785, + "learning_rate": 9.462529989571334e-06, + "loss": 0.4616, + "num_input_tokens_seen": 5161896, + "step": 7940 + }, + { + "epoch": 4.684551886792453, + "grad_norm": 1.519830584526062, + "learning_rate": 9.461369013429595e-06, + "loss": 0.5737, + "num_input_tokens_seen": 5167880, + "step": 7945 + }, + { + "epoch": 4.6875, + "grad_norm": 2.645507335662842, + "learning_rate": 9.460206856121183e-06, + "loss": 0.5744, + "num_input_tokens_seen": 5171112, + "step": 7950 + }, + { + "epoch": 4.690448113207547, + "grad_norm": 1.7675485610961914, + "learning_rate": 9.459043517953786e-06, + "loss": 0.4463, + "num_input_tokens_seen": 5174920, + "step": 7955 + }, + { + "epoch": 4.693396226415095, + "grad_norm": 1.7541604042053223, + "learning_rate": 9.457878999235396e-06, + "loss": 0.4516, + "num_input_tokens_seen": 5178088, + "step": 7960 + }, + { + "epoch": 4.696344339622642, + "grad_norm": 1.7906488180160522, + "learning_rate": 9.45671330027433e-06, + "loss": 0.6278, + "num_input_tokens_seen": 5181448, + "step": 7965 + }, + { + "epoch": 4.699292452830189, + "grad_norm": 1.631229043006897, + "learning_rate": 9.45554642137921e-06, + "loss": 0.4615, + "num_input_tokens_seen": 5183976, + "step": 7970 + }, + { + "epoch": 4.7022405660377355, + "grad_norm": 1.7123756408691406, + "learning_rate": 9.454378362858974e-06, + "loss": 0.37, + "num_input_tokens_seen": 5187688, + "step": 7975 + }, + { + "epoch": 4.705188679245283, + "grad_norm": 1.5067150592803955, + "learning_rate": 9.453209125022867e-06, + "loss": 0.6165, + "num_input_tokens_seen": 5190504, + "step": 7980 + }, + { + "epoch": 4.70813679245283, + "grad_norm": 1.6526689529418945, + "learning_rate": 9.452038708180453e-06, + "loss": 0.4526, + "num_input_tokens_seen": 5193032, + "step": 7985 + }, + { + "epoch": 4.711084905660377, + "grad_norm": 2.604750633239746, + "learning_rate": 9.450867112641603e-06, + "loss": 0.4659, + "num_input_tokens_seen": 5196968, + "step": 7990 + }, + { + "epoch": 4.714033018867925, + "grad_norm": 2.1780645847320557, + "learning_rate": 9.449694338716506e-06, + "loss": 0.4777, + "num_input_tokens_seen": 5199848, + "step": 7995 + }, + { + "epoch": 4.716981132075472, + "grad_norm": 1.6224945783615112, + "learning_rate": 9.448520386715653e-06, + "loss": 0.4135, + "num_input_tokens_seen": 5204040, + "step": 8000 + }, + { + "epoch": 4.719929245283019, + "grad_norm": 1.8815759420394897, + "learning_rate": 9.447345256949855e-06, + "loss": 0.3828, + "num_input_tokens_seen": 5207240, + "step": 8005 + }, + { + "epoch": 4.722877358490566, + "grad_norm": 1.3700987100601196, + "learning_rate": 9.446168949730234e-06, + "loss": 0.5591, + "num_input_tokens_seen": 5210312, + "step": 8010 + }, + { + "epoch": 4.725825471698113, + "grad_norm": 1.7998849153518677, + "learning_rate": 9.444991465368223e-06, + "loss": 0.4712, + "num_input_tokens_seen": 5214760, + "step": 8015 + }, + { + "epoch": 4.72877358490566, + "grad_norm": 1.5192891359329224, + "learning_rate": 9.443812804175562e-06, + "loss": 0.5624, + "num_input_tokens_seen": 5217960, + "step": 8020 + }, + { + "epoch": 4.7317216981132075, + "grad_norm": 1.0249063968658447, + "learning_rate": 9.44263296646431e-06, + "loss": 0.4757, + "num_input_tokens_seen": 5221576, + "step": 8025 + }, + { + "epoch": 4.734669811320755, + "grad_norm": 2.260812520980835, + "learning_rate": 9.441451952546835e-06, + "loss": 0.5286, + "num_input_tokens_seen": 5225160, + "step": 8030 + }, + { + "epoch": 4.737617924528302, + "grad_norm": 2.1413991451263428, + "learning_rate": 9.440269762735814e-06, + "loss": 0.456, + "num_input_tokens_seen": 5228072, + "step": 8035 + }, + { + "epoch": 4.740566037735849, + "grad_norm": 2.1210973262786865, + "learning_rate": 9.439086397344236e-06, + "loss": 0.4229, + "num_input_tokens_seen": 5231112, + "step": 8040 + }, + { + "epoch": 4.743514150943396, + "grad_norm": 0.9826104640960693, + "learning_rate": 9.437901856685404e-06, + "loss": 0.3379, + "num_input_tokens_seen": 5234440, + "step": 8045 + }, + { + "epoch": 4.746462264150943, + "grad_norm": 1.7591389417648315, + "learning_rate": 9.436716141072925e-06, + "loss": 0.3924, + "num_input_tokens_seen": 5237960, + "step": 8050 + }, + { + "epoch": 4.74941037735849, + "grad_norm": 2.459364414215088, + "learning_rate": 9.435529250820732e-06, + "loss": 0.3906, + "num_input_tokens_seen": 5240616, + "step": 8055 + }, + { + "epoch": 4.752358490566038, + "grad_norm": 1.8316712379455566, + "learning_rate": 9.43434118624305e-06, + "loss": 0.3462, + "num_input_tokens_seen": 5243784, + "step": 8060 + }, + { + "epoch": 4.755306603773585, + "grad_norm": 1.7045954465866089, + "learning_rate": 9.433151947654428e-06, + "loss": 0.4659, + "num_input_tokens_seen": 5247016, + "step": 8065 + }, + { + "epoch": 4.758254716981132, + "grad_norm": 1.8809270858764648, + "learning_rate": 9.431961535369724e-06, + "loss": 0.6033, + "num_input_tokens_seen": 5250504, + "step": 8070 + }, + { + "epoch": 4.7612028301886795, + "grad_norm": 1.4781222343444824, + "learning_rate": 9.430769949704103e-06, + "loss": 0.5665, + "num_input_tokens_seen": 5254056, + "step": 8075 + }, + { + "epoch": 4.764150943396227, + "grad_norm": 2.366100311279297, + "learning_rate": 9.42957719097304e-06, + "loss": 0.328, + "num_input_tokens_seen": 5257896, + "step": 8080 + }, + { + "epoch": 4.767099056603773, + "grad_norm": 1.9611051082611084, + "learning_rate": 9.42838325949233e-06, + "loss": 0.4381, + "num_input_tokens_seen": 5261544, + "step": 8085 + }, + { + "epoch": 4.7700471698113205, + "grad_norm": 1.8819278478622437, + "learning_rate": 9.427188155578062e-06, + "loss": 0.5191, + "num_input_tokens_seen": 5265352, + "step": 8090 + }, + { + "epoch": 4.772995283018868, + "grad_norm": 3.05875563621521, + "learning_rate": 9.42599187954665e-06, + "loss": 0.5181, + "num_input_tokens_seen": 5268648, + "step": 8095 + }, + { + "epoch": 4.775943396226415, + "grad_norm": 2.150609254837036, + "learning_rate": 9.424794431714814e-06, + "loss": 0.5463, + "num_input_tokens_seen": 5272200, + "step": 8100 + }, + { + "epoch": 4.778891509433962, + "grad_norm": 2.5011281967163086, + "learning_rate": 9.423595812399581e-06, + "loss": 0.5231, + "num_input_tokens_seen": 5275048, + "step": 8105 + }, + { + "epoch": 4.78183962264151, + "grad_norm": 1.171621322631836, + "learning_rate": 9.422396021918296e-06, + "loss": 0.5194, + "num_input_tokens_seen": 5278600, + "step": 8110 + }, + { + "epoch": 4.784787735849057, + "grad_norm": 2.025660991668701, + "learning_rate": 9.421195060588602e-06, + "loss": 0.5531, + "num_input_tokens_seen": 5282056, + "step": 8115 + }, + { + "epoch": 4.787735849056604, + "grad_norm": 1.4467459917068481, + "learning_rate": 9.419992928728461e-06, + "loss": 0.3538, + "num_input_tokens_seen": 5284808, + "step": 8120 + }, + { + "epoch": 4.790683962264151, + "grad_norm": 2.263584852218628, + "learning_rate": 9.418789626656144e-06, + "loss": 0.4442, + "num_input_tokens_seen": 5287880, + "step": 8125 + }, + { + "epoch": 4.793632075471698, + "grad_norm": 1.2460007667541504, + "learning_rate": 9.417585154690229e-06, + "loss": 0.3562, + "num_input_tokens_seen": 5290728, + "step": 8130 + }, + { + "epoch": 4.796580188679245, + "grad_norm": 1.7463536262512207, + "learning_rate": 9.416379513149605e-06, + "loss": 0.3286, + "num_input_tokens_seen": 5293640, + "step": 8135 + }, + { + "epoch": 4.7995283018867925, + "grad_norm": 1.94441556930542, + "learning_rate": 9.415172702353471e-06, + "loss": 0.4564, + "num_input_tokens_seen": 5297768, + "step": 8140 + }, + { + "epoch": 4.80247641509434, + "grad_norm": 2.4404234886169434, + "learning_rate": 9.413964722621339e-06, + "loss": 0.4186, + "num_input_tokens_seen": 5300680, + "step": 8145 + }, + { + "epoch": 4.805424528301887, + "grad_norm": 1.5452872514724731, + "learning_rate": 9.41275557427302e-06, + "loss": 0.5172, + "num_input_tokens_seen": 5303880, + "step": 8150 + }, + { + "epoch": 4.808372641509434, + "grad_norm": 2.145986795425415, + "learning_rate": 9.411545257628646e-06, + "loss": 0.4725, + "num_input_tokens_seen": 5307048, + "step": 8155 + }, + { + "epoch": 4.811320754716981, + "grad_norm": 2.5573008060455322, + "learning_rate": 9.410333773008653e-06, + "loss": 0.3675, + "num_input_tokens_seen": 5310088, + "step": 8160 + }, + { + "epoch": 4.814268867924528, + "grad_norm": 2.309384822845459, + "learning_rate": 9.409121120733784e-06, + "loss": 0.4108, + "num_input_tokens_seen": 5313320, + "step": 8165 + }, + { + "epoch": 4.817216981132075, + "grad_norm": 1.8320255279541016, + "learning_rate": 9.4079073011251e-06, + "loss": 0.4226, + "num_input_tokens_seen": 5317480, + "step": 8170 + }, + { + "epoch": 4.820165094339623, + "grad_norm": 2.182582139968872, + "learning_rate": 9.406692314503956e-06, + "loss": 0.5109, + "num_input_tokens_seen": 5320168, + "step": 8175 + }, + { + "epoch": 4.82311320754717, + "grad_norm": 1.430658221244812, + "learning_rate": 9.405476161192033e-06, + "loss": 0.3242, + "num_input_tokens_seen": 5326472, + "step": 8180 + }, + { + "epoch": 4.826061320754717, + "grad_norm": 1.8482496738433838, + "learning_rate": 9.40425884151131e-06, + "loss": 0.4166, + "num_input_tokens_seen": 5330728, + "step": 8185 + }, + { + "epoch": 4.8290094339622645, + "grad_norm": 5.08083438873291, + "learning_rate": 9.403040355784076e-06, + "loss": 0.5636, + "num_input_tokens_seen": 5333448, + "step": 8190 + }, + { + "epoch": 4.831957547169811, + "grad_norm": 1.3805896043777466, + "learning_rate": 9.401820704332932e-06, + "loss": 0.4205, + "num_input_tokens_seen": 5336360, + "step": 8195 + }, + { + "epoch": 4.834905660377358, + "grad_norm": 1.5043219327926636, + "learning_rate": 9.400599887480786e-06, + "loss": 0.4176, + "num_input_tokens_seen": 5339560, + "step": 8200 + }, + { + "epoch": 4.837853773584905, + "grad_norm": 3.3727777004241943, + "learning_rate": 9.399377905550854e-06, + "loss": 0.6162, + "num_input_tokens_seen": 5342344, + "step": 8205 + }, + { + "epoch": 4.840801886792453, + "grad_norm": 1.4693987369537354, + "learning_rate": 9.398154758866662e-06, + "loss": 0.3588, + "num_input_tokens_seen": 5346120, + "step": 8210 + }, + { + "epoch": 4.84375, + "grad_norm": 2.595595121383667, + "learning_rate": 9.396930447752041e-06, + "loss": 0.4534, + "num_input_tokens_seen": 5352616, + "step": 8215 + }, + { + "epoch": 4.846698113207547, + "grad_norm": 1.3976528644561768, + "learning_rate": 9.395704972531137e-06, + "loss": 0.689, + "num_input_tokens_seen": 5355464, + "step": 8220 + }, + { + "epoch": 4.849646226415095, + "grad_norm": 1.3760422468185425, + "learning_rate": 9.394478333528396e-06, + "loss": 0.4442, + "num_input_tokens_seen": 5358504, + "step": 8225 + }, + { + "epoch": 4.852594339622642, + "grad_norm": 2.2249667644500732, + "learning_rate": 9.393250531068576e-06, + "loss": 0.364, + "num_input_tokens_seen": 5360648, + "step": 8230 + }, + { + "epoch": 4.855542452830189, + "grad_norm": 2.19929575920105, + "learning_rate": 9.392021565476744e-06, + "loss": 0.3551, + "num_input_tokens_seen": 5366024, + "step": 8235 + }, + { + "epoch": 4.8584905660377355, + "grad_norm": 1.030867099761963, + "learning_rate": 9.390791437078274e-06, + "loss": 0.3379, + "num_input_tokens_seen": 5369352, + "step": 8240 + }, + { + "epoch": 4.861438679245283, + "grad_norm": 1.9071762561798096, + "learning_rate": 9.38956014619885e-06, + "loss": 0.4944, + "num_input_tokens_seen": 5372104, + "step": 8245 + }, + { + "epoch": 4.86438679245283, + "grad_norm": 2.5750532150268555, + "learning_rate": 9.388327693164456e-06, + "loss": 0.6616, + "num_input_tokens_seen": 5374728, + "step": 8250 + }, + { + "epoch": 4.867334905660377, + "grad_norm": 4.7101826667785645, + "learning_rate": 9.387094078301395e-06, + "loss": 0.4208, + "num_input_tokens_seen": 5380328, + "step": 8255 + }, + { + "epoch": 4.870283018867925, + "grad_norm": 2.9712626934051514, + "learning_rate": 9.385859301936269e-06, + "loss": 0.3802, + "num_input_tokens_seen": 5384488, + "step": 8260 + }, + { + "epoch": 4.873231132075472, + "grad_norm": 3.576486349105835, + "learning_rate": 9.38462336439599e-06, + "loss": 0.3822, + "num_input_tokens_seen": 5386888, + "step": 8265 + }, + { + "epoch": 4.876179245283019, + "grad_norm": 4.8146443367004395, + "learning_rate": 9.383386266007779e-06, + "loss": 0.5905, + "num_input_tokens_seen": 5389576, + "step": 8270 + }, + { + "epoch": 4.879127358490566, + "grad_norm": 1.5878015756607056, + "learning_rate": 9.382148007099164e-06, + "loss": 0.4105, + "num_input_tokens_seen": 5392744, + "step": 8275 + }, + { + "epoch": 4.882075471698113, + "grad_norm": 1.122544288635254, + "learning_rate": 9.380908587997977e-06, + "loss": 0.3934, + "num_input_tokens_seen": 5396328, + "step": 8280 + }, + { + "epoch": 4.88502358490566, + "grad_norm": 1.813158392906189, + "learning_rate": 9.37966800903236e-06, + "loss": 0.3989, + "num_input_tokens_seen": 5399496, + "step": 8285 + }, + { + "epoch": 4.8879716981132075, + "grad_norm": 1.7239893674850464, + "learning_rate": 9.378426270530762e-06, + "loss": 0.4764, + "num_input_tokens_seen": 5402312, + "step": 8290 + }, + { + "epoch": 4.890919811320755, + "grad_norm": 3.5756137371063232, + "learning_rate": 9.37718337282194e-06, + "loss": 0.4632, + "num_input_tokens_seen": 5405928, + "step": 8295 + }, + { + "epoch": 4.893867924528302, + "grad_norm": 1.8245495557785034, + "learning_rate": 9.375939316234956e-06, + "loss": 0.4719, + "num_input_tokens_seen": 5409608, + "step": 8300 + }, + { + "epoch": 4.896816037735849, + "grad_norm": 1.3866806030273438, + "learning_rate": 9.374694101099178e-06, + "loss": 0.4291, + "num_input_tokens_seen": 5412744, + "step": 8305 + }, + { + "epoch": 4.899764150943396, + "grad_norm": 2.032897710800171, + "learning_rate": 9.373447727744282e-06, + "loss": 0.4423, + "num_input_tokens_seen": 5417512, + "step": 8310 + }, + { + "epoch": 4.902712264150943, + "grad_norm": 1.8219990730285645, + "learning_rate": 9.372200196500253e-06, + "loss": 0.4771, + "num_input_tokens_seen": 5420648, + "step": 8315 + }, + { + "epoch": 4.90566037735849, + "grad_norm": 1.8803461790084839, + "learning_rate": 9.37095150769738e-06, + "loss": 0.3742, + "num_input_tokens_seen": 5423016, + "step": 8320 + }, + { + "epoch": 4.908608490566038, + "grad_norm": 3.38843035697937, + "learning_rate": 9.369701661666255e-06, + "loss": 0.4001, + "num_input_tokens_seen": 5425544, + "step": 8325 + }, + { + "epoch": 4.911556603773585, + "grad_norm": 1.718065857887268, + "learning_rate": 9.368450658737782e-06, + "loss": 0.4381, + "num_input_tokens_seen": 5429032, + "step": 8330 + }, + { + "epoch": 4.914504716981132, + "grad_norm": 1.6822131872177124, + "learning_rate": 9.367198499243173e-06, + "loss": 0.5225, + "num_input_tokens_seen": 5432040, + "step": 8335 + }, + { + "epoch": 4.9174528301886795, + "grad_norm": 1.7496016025543213, + "learning_rate": 9.365945183513938e-06, + "loss": 0.4484, + "num_input_tokens_seen": 5435240, + "step": 8340 + }, + { + "epoch": 4.920400943396227, + "grad_norm": 1.6040019989013672, + "learning_rate": 9.3646907118819e-06, + "loss": 0.474, + "num_input_tokens_seen": 5438952, + "step": 8345 + }, + { + "epoch": 4.923349056603773, + "grad_norm": 2.2996151447296143, + "learning_rate": 9.363435084679185e-06, + "loss": 0.4824, + "num_input_tokens_seen": 5441672, + "step": 8350 + }, + { + "epoch": 4.9262971698113205, + "grad_norm": 2.250028610229492, + "learning_rate": 9.362178302238227e-06, + "loss": 0.4053, + "num_input_tokens_seen": 5444488, + "step": 8355 + }, + { + "epoch": 4.929245283018868, + "grad_norm": 3.3839643001556396, + "learning_rate": 9.360920364891762e-06, + "loss": 0.4813, + "num_input_tokens_seen": 5447400, + "step": 8360 + }, + { + "epoch": 4.932193396226415, + "grad_norm": 2.048722743988037, + "learning_rate": 9.359661272972836e-06, + "loss": 0.398, + "num_input_tokens_seen": 5451112, + "step": 8365 + }, + { + "epoch": 4.935141509433962, + "grad_norm": 1.8574259281158447, + "learning_rate": 9.3584010268148e-06, + "loss": 0.4676, + "num_input_tokens_seen": 5454376, + "step": 8370 + }, + { + "epoch": 4.93808962264151, + "grad_norm": 1.2148648500442505, + "learning_rate": 9.357139626751308e-06, + "loss": 0.4263, + "num_input_tokens_seen": 5457896, + "step": 8375 + }, + { + "epoch": 4.941037735849057, + "grad_norm": 1.3089103698730469, + "learning_rate": 9.355877073116321e-06, + "loss": 0.4814, + "num_input_tokens_seen": 5460584, + "step": 8380 + }, + { + "epoch": 4.943985849056604, + "grad_norm": 2.2912652492523193, + "learning_rate": 9.354613366244108e-06, + "loss": 0.4842, + "num_input_tokens_seen": 5463368, + "step": 8385 + }, + { + "epoch": 4.946933962264151, + "grad_norm": 2.569997549057007, + "learning_rate": 9.353348506469236e-06, + "loss": 0.6057, + "num_input_tokens_seen": 5467720, + "step": 8390 + }, + { + "epoch": 4.949882075471698, + "grad_norm": 2.156742811203003, + "learning_rate": 9.352082494126586e-06, + "loss": 0.4386, + "num_input_tokens_seen": 5470792, + "step": 8395 + }, + { + "epoch": 4.952830188679245, + "grad_norm": 3.5158495903015137, + "learning_rate": 9.350815329551341e-06, + "loss": 0.4343, + "num_input_tokens_seen": 5474504, + "step": 8400 + }, + { + "epoch": 4.9557783018867925, + "grad_norm": 2.7224245071411133, + "learning_rate": 9.349547013078986e-06, + "loss": 0.4271, + "num_input_tokens_seen": 5478536, + "step": 8405 + }, + { + "epoch": 4.95872641509434, + "grad_norm": 4.086310386657715, + "learning_rate": 9.348277545045312e-06, + "loss": 0.5504, + "num_input_tokens_seen": 5480936, + "step": 8410 + }, + { + "epoch": 4.961674528301887, + "grad_norm": 1.789497971534729, + "learning_rate": 9.347006925786418e-06, + "loss": 0.5185, + "num_input_tokens_seen": 5484200, + "step": 8415 + }, + { + "epoch": 4.964622641509434, + "grad_norm": 4.406622409820557, + "learning_rate": 9.34573515563871e-06, + "loss": 0.4222, + "num_input_tokens_seen": 5487496, + "step": 8420 + }, + { + "epoch": 4.967570754716981, + "grad_norm": 2.5554046630859375, + "learning_rate": 9.344462234938885e-06, + "loss": 0.3243, + "num_input_tokens_seen": 5489864, + "step": 8425 + }, + { + "epoch": 4.970518867924528, + "grad_norm": 1.692103624343872, + "learning_rate": 9.343188164023962e-06, + "loss": 0.5222, + "num_input_tokens_seen": 5493704, + "step": 8430 + }, + { + "epoch": 4.973466981132075, + "grad_norm": 2.1611404418945312, + "learning_rate": 9.341912943231256e-06, + "loss": 0.4547, + "num_input_tokens_seen": 5499560, + "step": 8435 + }, + { + "epoch": 4.976415094339623, + "grad_norm": 3.4097931385040283, + "learning_rate": 9.340636572898383e-06, + "loss": 0.648, + "num_input_tokens_seen": 5501992, + "step": 8440 + }, + { + "epoch": 4.97936320754717, + "grad_norm": 1.1731622219085693, + "learning_rate": 9.339359053363272e-06, + "loss": 0.3866, + "num_input_tokens_seen": 5505352, + "step": 8445 + }, + { + "epoch": 4.982311320754717, + "grad_norm": 2.038130044937134, + "learning_rate": 9.338080384964148e-06, + "loss": 0.5358, + "num_input_tokens_seen": 5508232, + "step": 8450 + }, + { + "epoch": 4.9852594339622645, + "grad_norm": 1.9956356287002563, + "learning_rate": 9.336800568039548e-06, + "loss": 0.4299, + "num_input_tokens_seen": 5512872, + "step": 8455 + }, + { + "epoch": 4.988207547169811, + "grad_norm": 1.1257824897766113, + "learning_rate": 9.335519602928307e-06, + "loss": 0.5236, + "num_input_tokens_seen": 5516584, + "step": 8460 + }, + { + "epoch": 4.991155660377358, + "grad_norm": 1.1254146099090576, + "learning_rate": 9.334237489969565e-06, + "loss": 0.4382, + "num_input_tokens_seen": 5520392, + "step": 8465 + }, + { + "epoch": 4.994103773584905, + "grad_norm": 1.9410003423690796, + "learning_rate": 9.332954229502768e-06, + "loss": 0.4476, + "num_input_tokens_seen": 5523528, + "step": 8470 + }, + { + "epoch": 4.997051886792453, + "grad_norm": 1.9509012699127197, + "learning_rate": 9.331669821867665e-06, + "loss": 0.3976, + "num_input_tokens_seen": 5526216, + "step": 8475 + }, + { + "epoch": 5.0, + "grad_norm": 4.298738956451416, + "learning_rate": 9.33038426740431e-06, + "loss": 0.4917, + "num_input_tokens_seen": 5528680, + "step": 8480 + }, + { + "epoch": 5.002948113207547, + "grad_norm": 2.41084885597229, + "learning_rate": 9.329097566453055e-06, + "loss": 0.4527, + "num_input_tokens_seen": 5531464, + "step": 8485 + }, + { + "epoch": 5.005896226415095, + "grad_norm": 0.9695172905921936, + "learning_rate": 9.327809719354564e-06, + "loss": 0.5777, + "num_input_tokens_seen": 5534216, + "step": 8490 + }, + { + "epoch": 5.008844339622642, + "grad_norm": 2.997445821762085, + "learning_rate": 9.326520726449795e-06, + "loss": 0.6475, + "num_input_tokens_seen": 5537224, + "step": 8495 + }, + { + "epoch": 5.011792452830188, + "grad_norm": 1.2498568296432495, + "learning_rate": 9.32523058808002e-06, + "loss": 0.4129, + "num_input_tokens_seen": 5540904, + "step": 8500 + }, + { + "epoch": 5.0147405660377355, + "grad_norm": 2.137699604034424, + "learning_rate": 9.323939304586806e-06, + "loss": 0.5609, + "num_input_tokens_seen": 5544328, + "step": 8505 + }, + { + "epoch": 5.017688679245283, + "grad_norm": 1.3748759031295776, + "learning_rate": 9.322646876312025e-06, + "loss": 0.3596, + "num_input_tokens_seen": 5550984, + "step": 8510 + }, + { + "epoch": 5.02063679245283, + "grad_norm": 2.105956792831421, + "learning_rate": 9.321353303597854e-06, + "loss": 0.375, + "num_input_tokens_seen": 5555240, + "step": 8515 + }, + { + "epoch": 5.023584905660377, + "grad_norm": 2.305466890335083, + "learning_rate": 9.320058586786771e-06, + "loss": 0.4131, + "num_input_tokens_seen": 5559208, + "step": 8520 + }, + { + "epoch": 5.026533018867925, + "grad_norm": 1.2828630208969116, + "learning_rate": 9.31876272622156e-06, + "loss": 0.3864, + "num_input_tokens_seen": 5562824, + "step": 8525 + }, + { + "epoch": 5.029481132075472, + "grad_norm": 1.771759271621704, + "learning_rate": 9.317465722245305e-06, + "loss": 0.4407, + "num_input_tokens_seen": 5566216, + "step": 8530 + }, + { + "epoch": 5.032429245283019, + "grad_norm": 0.84317547082901, + "learning_rate": 9.316167575201391e-06, + "loss": 0.3771, + "num_input_tokens_seen": 5570696, + "step": 8535 + }, + { + "epoch": 5.035377358490566, + "grad_norm": 1.3762197494506836, + "learning_rate": 9.31486828543351e-06, + "loss": 0.5153, + "num_input_tokens_seen": 5574248, + "step": 8540 + }, + { + "epoch": 5.038325471698113, + "grad_norm": 2.493542432785034, + "learning_rate": 9.313567853285656e-06, + "loss": 0.3103, + "num_input_tokens_seen": 5577256, + "step": 8545 + }, + { + "epoch": 5.04127358490566, + "grad_norm": 1.4198964834213257, + "learning_rate": 9.31226627910212e-06, + "loss": 0.4426, + "num_input_tokens_seen": 5580872, + "step": 8550 + }, + { + "epoch": 5.0442216981132075, + "grad_norm": 1.9185653924942017, + "learning_rate": 9.310963563227504e-06, + "loss": 0.4514, + "num_input_tokens_seen": 5585256, + "step": 8555 + }, + { + "epoch": 5.047169811320755, + "grad_norm": 1.3696937561035156, + "learning_rate": 9.309659706006704e-06, + "loss": 0.4937, + "num_input_tokens_seen": 5587592, + "step": 8560 + }, + { + "epoch": 5.050117924528302, + "grad_norm": 2.1512253284454346, + "learning_rate": 9.308354707784925e-06, + "loss": 0.3876, + "num_input_tokens_seen": 5591464, + "step": 8565 + }, + { + "epoch": 5.053066037735849, + "grad_norm": 4.915706634521484, + "learning_rate": 9.307048568907669e-06, + "loss": 0.4688, + "num_input_tokens_seen": 5594344, + "step": 8570 + }, + { + "epoch": 5.056014150943396, + "grad_norm": 2.2504372596740723, + "learning_rate": 9.30574128972074e-06, + "loss": 0.42, + "num_input_tokens_seen": 5597192, + "step": 8575 + }, + { + "epoch": 5.058962264150943, + "grad_norm": 1.4182058572769165, + "learning_rate": 9.304432870570247e-06, + "loss": 0.4025, + "num_input_tokens_seen": 5599784, + "step": 8580 + }, + { + "epoch": 5.06191037735849, + "grad_norm": 1.2327818870544434, + "learning_rate": 9.303123311802605e-06, + "loss": 0.4048, + "num_input_tokens_seen": 5602792, + "step": 8585 + }, + { + "epoch": 5.064858490566038, + "grad_norm": 3.1173484325408936, + "learning_rate": 9.301812613764516e-06, + "loss": 0.4445, + "num_input_tokens_seen": 5605896, + "step": 8590 + }, + { + "epoch": 5.067806603773585, + "grad_norm": 2.4708333015441895, + "learning_rate": 9.300500776803001e-06, + "loss": 0.4476, + "num_input_tokens_seen": 5608712, + "step": 8595 + }, + { + "epoch": 5.070754716981132, + "grad_norm": 1.5063966512680054, + "learning_rate": 9.29918780126537e-06, + "loss": 0.4131, + "num_input_tokens_seen": 5611432, + "step": 8600 + }, + { + "epoch": 5.0737028301886795, + "grad_norm": 1.5420846939086914, + "learning_rate": 9.297873687499239e-06, + "loss": 0.4427, + "num_input_tokens_seen": 5614440, + "step": 8605 + }, + { + "epoch": 5.076650943396227, + "grad_norm": 1.980177879333496, + "learning_rate": 9.296558435852528e-06, + "loss": 0.3489, + "num_input_tokens_seen": 5617032, + "step": 8610 + }, + { + "epoch": 5.079599056603773, + "grad_norm": 2.1566474437713623, + "learning_rate": 9.295242046673454e-06, + "loss": 0.4595, + "num_input_tokens_seen": 5620072, + "step": 8615 + }, + { + "epoch": 5.0825471698113205, + "grad_norm": 1.7096978425979614, + "learning_rate": 9.293924520310535e-06, + "loss": 0.501, + "num_input_tokens_seen": 5623304, + "step": 8620 + }, + { + "epoch": 5.085495283018868, + "grad_norm": 1.129740834236145, + "learning_rate": 9.292605857112595e-06, + "loss": 0.4741, + "num_input_tokens_seen": 5627080, + "step": 8625 + }, + { + "epoch": 5.088443396226415, + "grad_norm": 3.421769380569458, + "learning_rate": 9.291286057428755e-06, + "loss": 0.4497, + "num_input_tokens_seen": 5630568, + "step": 8630 + }, + { + "epoch": 5.091391509433962, + "grad_norm": 1.4722706079483032, + "learning_rate": 9.289965121608436e-06, + "loss": 0.2771, + "num_input_tokens_seen": 5633864, + "step": 8635 + }, + { + "epoch": 5.09433962264151, + "grad_norm": 1.4053847789764404, + "learning_rate": 9.288643050001362e-06, + "loss": 0.4199, + "num_input_tokens_seen": 5636552, + "step": 8640 + }, + { + "epoch": 5.097287735849057, + "grad_norm": 1.5366727113723755, + "learning_rate": 9.287319842957557e-06, + "loss": 0.5116, + "num_input_tokens_seen": 5639304, + "step": 8645 + }, + { + "epoch": 5.100235849056604, + "grad_norm": 2.2252590656280518, + "learning_rate": 9.285995500827348e-06, + "loss": 0.4196, + "num_input_tokens_seen": 5641960, + "step": 8650 + }, + { + "epoch": 5.103183962264151, + "grad_norm": 4.010032653808594, + "learning_rate": 9.284670023961355e-06, + "loss": 0.5389, + "num_input_tokens_seen": 5645928, + "step": 8655 + }, + { + "epoch": 5.106132075471698, + "grad_norm": 2.2784535884857178, + "learning_rate": 9.28334341271051e-06, + "loss": 0.5125, + "num_input_tokens_seen": 5649160, + "step": 8660 + }, + { + "epoch": 5.109080188679245, + "grad_norm": 3.3265559673309326, + "learning_rate": 9.282015667426036e-06, + "loss": 0.4747, + "num_input_tokens_seen": 5651816, + "step": 8665 + }, + { + "epoch": 5.1120283018867925, + "grad_norm": 1.9093157052993774, + "learning_rate": 9.280686788459461e-06, + "loss": 0.5281, + "num_input_tokens_seen": 5656328, + "step": 8670 + }, + { + "epoch": 5.11497641509434, + "grad_norm": 0.9107136726379395, + "learning_rate": 9.279356776162606e-06, + "loss": 0.4091, + "num_input_tokens_seen": 5660008, + "step": 8675 + }, + { + "epoch": 5.117924528301887, + "grad_norm": 1.8482780456542969, + "learning_rate": 9.278025630887607e-06, + "loss": 0.4998, + "num_input_tokens_seen": 5662696, + "step": 8680 + }, + { + "epoch": 5.120872641509434, + "grad_norm": 3.0636982917785645, + "learning_rate": 9.27669335298688e-06, + "loss": 0.3865, + "num_input_tokens_seen": 5666216, + "step": 8685 + }, + { + "epoch": 5.123820754716981, + "grad_norm": 2.0669620037078857, + "learning_rate": 9.275359942813158e-06, + "loss": 0.4275, + "num_input_tokens_seen": 5669128, + "step": 8690 + }, + { + "epoch": 5.126768867924528, + "grad_norm": 1.4080373048782349, + "learning_rate": 9.274025400719466e-06, + "loss": 0.4193, + "num_input_tokens_seen": 5672168, + "step": 8695 + }, + { + "epoch": 5.129716981132075, + "grad_norm": 4.376392364501953, + "learning_rate": 9.27268972705913e-06, + "loss": 0.4861, + "num_input_tokens_seen": 5675528, + "step": 8700 + }, + { + "epoch": 5.132665094339623, + "grad_norm": 1.2970664501190186, + "learning_rate": 9.271352922185772e-06, + "loss": 0.5187, + "num_input_tokens_seen": 5680072, + "step": 8705 + }, + { + "epoch": 5.13561320754717, + "grad_norm": 1.3828120231628418, + "learning_rate": 9.270014986453321e-06, + "loss": 0.4098, + "num_input_tokens_seen": 5683688, + "step": 8710 + }, + { + "epoch": 5.138561320754717, + "grad_norm": 2.581124782562256, + "learning_rate": 9.268675920215999e-06, + "loss": 0.5463, + "num_input_tokens_seen": 5686600, + "step": 8715 + }, + { + "epoch": 5.1415094339622645, + "grad_norm": 1.3863767385482788, + "learning_rate": 9.26733572382833e-06, + "loss": 0.4909, + "num_input_tokens_seen": 5690376, + "step": 8720 + }, + { + "epoch": 5.144457547169812, + "grad_norm": 2.3836750984191895, + "learning_rate": 9.265994397645137e-06, + "loss": 0.462, + "num_input_tokens_seen": 5692840, + "step": 8725 + }, + { + "epoch": 5.147405660377358, + "grad_norm": 2.159472942352295, + "learning_rate": 9.264651942021543e-06, + "loss": 0.5319, + "num_input_tokens_seen": 5697032, + "step": 8730 + }, + { + "epoch": 5.150353773584905, + "grad_norm": 2.1057000160217285, + "learning_rate": 9.263308357312966e-06, + "loss": 0.4488, + "num_input_tokens_seen": 5700552, + "step": 8735 + }, + { + "epoch": 5.153301886792453, + "grad_norm": 1.8502203226089478, + "learning_rate": 9.26196364387513e-06, + "loss": 0.4837, + "num_input_tokens_seen": 5705288, + "step": 8740 + }, + { + "epoch": 5.15625, + "grad_norm": 1.8362536430358887, + "learning_rate": 9.26061780206405e-06, + "loss": 0.4218, + "num_input_tokens_seen": 5708008, + "step": 8745 + }, + { + "epoch": 5.159198113207547, + "grad_norm": 2.5595719814300537, + "learning_rate": 9.259270832236043e-06, + "loss": 0.454, + "num_input_tokens_seen": 5711240, + "step": 8750 + }, + { + "epoch": 5.162146226415095, + "grad_norm": 2.4376637935638428, + "learning_rate": 9.257922734747729e-06, + "loss": 0.481, + "num_input_tokens_seen": 5714312, + "step": 8755 + }, + { + "epoch": 5.165094339622642, + "grad_norm": 1.6480398178100586, + "learning_rate": 9.256573509956018e-06, + "loss": 0.418, + "num_input_tokens_seen": 5717320, + "step": 8760 + }, + { + "epoch": 5.168042452830188, + "grad_norm": 1.7073054313659668, + "learning_rate": 9.255223158218127e-06, + "loss": 0.425, + "num_input_tokens_seen": 5722088, + "step": 8765 + }, + { + "epoch": 5.1709905660377355, + "grad_norm": 1.4120420217514038, + "learning_rate": 9.253871679891566e-06, + "loss": 0.3329, + "num_input_tokens_seen": 5725256, + "step": 8770 + }, + { + "epoch": 5.173938679245283, + "grad_norm": 2.0053324699401855, + "learning_rate": 9.252519075334143e-06, + "loss": 0.4566, + "num_input_tokens_seen": 5728744, + "step": 8775 + }, + { + "epoch": 5.17688679245283, + "grad_norm": 2.7283477783203125, + "learning_rate": 9.251165344903969e-06, + "loss": 0.4003, + "num_input_tokens_seen": 5732168, + "step": 8780 + }, + { + "epoch": 5.179834905660377, + "grad_norm": 1.87785804271698, + "learning_rate": 9.249810488959448e-06, + "loss": 0.5422, + "num_input_tokens_seen": 5735752, + "step": 8785 + }, + { + "epoch": 5.182783018867925, + "grad_norm": 1.3296831846237183, + "learning_rate": 9.248454507859285e-06, + "loss": 0.5083, + "num_input_tokens_seen": 5738856, + "step": 8790 + }, + { + "epoch": 5.185731132075472, + "grad_norm": 1.4376671314239502, + "learning_rate": 9.247097401962482e-06, + "loss": 0.4668, + "num_input_tokens_seen": 5742696, + "step": 8795 + }, + { + "epoch": 5.188679245283019, + "grad_norm": 2.4449105262756348, + "learning_rate": 9.245739171628335e-06, + "loss": 0.4625, + "num_input_tokens_seen": 5745640, + "step": 8800 + }, + { + "epoch": 5.191627358490566, + "grad_norm": 2.370718002319336, + "learning_rate": 9.244379817216447e-06, + "loss": 0.4479, + "num_input_tokens_seen": 5749000, + "step": 8805 + }, + { + "epoch": 5.194575471698113, + "grad_norm": 1.3816624879837036, + "learning_rate": 9.243019339086708e-06, + "loss": 0.3377, + "num_input_tokens_seen": 5752808, + "step": 8810 + }, + { + "epoch": 5.19752358490566, + "grad_norm": 2.0664796829223633, + "learning_rate": 9.241657737599313e-06, + "loss": 0.4219, + "num_input_tokens_seen": 5756296, + "step": 8815 + }, + { + "epoch": 5.2004716981132075, + "grad_norm": 2.264397144317627, + "learning_rate": 9.240295013114752e-06, + "loss": 0.4986, + "num_input_tokens_seen": 5758984, + "step": 8820 + }, + { + "epoch": 5.203419811320755, + "grad_norm": 2.9029133319854736, + "learning_rate": 9.238931165993811e-06, + "loss": 0.4544, + "num_input_tokens_seen": 5762472, + "step": 8825 + }, + { + "epoch": 5.206367924528302, + "grad_norm": 3.6382994651794434, + "learning_rate": 9.237566196597577e-06, + "loss": 0.5471, + "num_input_tokens_seen": 5767592, + "step": 8830 + }, + { + "epoch": 5.209316037735849, + "grad_norm": 2.148664951324463, + "learning_rate": 9.236200105287427e-06, + "loss": 0.5388, + "num_input_tokens_seen": 5770792, + "step": 8835 + }, + { + "epoch": 5.212264150943396, + "grad_norm": 2.5313024520874023, + "learning_rate": 9.234832892425042e-06, + "loss": 0.5583, + "num_input_tokens_seen": 5773512, + "step": 8840 + }, + { + "epoch": 5.215212264150943, + "grad_norm": 2.3997085094451904, + "learning_rate": 9.2334645583724e-06, + "loss": 0.5692, + "num_input_tokens_seen": 5776456, + "step": 8845 + }, + { + "epoch": 5.21816037735849, + "grad_norm": 1.5615992546081543, + "learning_rate": 9.23209510349177e-06, + "loss": 0.4607, + "num_input_tokens_seen": 5779976, + "step": 8850 + }, + { + "epoch": 5.221108490566038, + "grad_norm": 1.5125142335891724, + "learning_rate": 9.230724528145722e-06, + "loss": 0.3879, + "num_input_tokens_seen": 5782376, + "step": 8855 + }, + { + "epoch": 5.224056603773585, + "grad_norm": 1.617682695388794, + "learning_rate": 9.229352832697122e-06, + "loss": 0.3876, + "num_input_tokens_seen": 5785704, + "step": 8860 + }, + { + "epoch": 5.227004716981132, + "grad_norm": 1.7704075574874878, + "learning_rate": 9.22798001750913e-06, + "loss": 0.5026, + "num_input_tokens_seen": 5789736, + "step": 8865 + }, + { + "epoch": 5.2299528301886795, + "grad_norm": 2.8150506019592285, + "learning_rate": 9.226606082945209e-06, + "loss": 0.2945, + "num_input_tokens_seen": 5792168, + "step": 8870 + }, + { + "epoch": 5.232900943396227, + "grad_norm": 2.2224793434143066, + "learning_rate": 9.225231029369112e-06, + "loss": 0.4544, + "num_input_tokens_seen": 5795112, + "step": 8875 + }, + { + "epoch": 5.235849056603773, + "grad_norm": 1.9538326263427734, + "learning_rate": 9.22385485714489e-06, + "loss": 0.5022, + "num_input_tokens_seen": 5798216, + "step": 8880 + }, + { + "epoch": 5.2387971698113205, + "grad_norm": 3.0247950553894043, + "learning_rate": 9.222477566636889e-06, + "loss": 0.4593, + "num_input_tokens_seen": 5801448, + "step": 8885 + }, + { + "epoch": 5.241745283018868, + "grad_norm": 2.8815228939056396, + "learning_rate": 9.221099158209757e-06, + "loss": 0.5805, + "num_input_tokens_seen": 5804072, + "step": 8890 + }, + { + "epoch": 5.244693396226415, + "grad_norm": 2.2217464447021484, + "learning_rate": 9.219719632228429e-06, + "loss": 0.4423, + "num_input_tokens_seen": 5808200, + "step": 8895 + }, + { + "epoch": 5.247641509433962, + "grad_norm": 2.813058853149414, + "learning_rate": 9.218338989058141e-06, + "loss": 0.3634, + "num_input_tokens_seen": 5810376, + "step": 8900 + }, + { + "epoch": 5.25058962264151, + "grad_norm": 1.556547999382019, + "learning_rate": 9.21695722906443e-06, + "loss": 0.5097, + "num_input_tokens_seen": 5813448, + "step": 8905 + }, + { + "epoch": 5.253537735849057, + "grad_norm": 1.4735699892044067, + "learning_rate": 9.215574352613115e-06, + "loss": 0.5123, + "num_input_tokens_seen": 5816584, + "step": 8910 + }, + { + "epoch": 5.256485849056604, + "grad_norm": 7.339044570922852, + "learning_rate": 9.214190360070323e-06, + "loss": 0.4735, + "num_input_tokens_seen": 5820424, + "step": 8915 + }, + { + "epoch": 5.259433962264151, + "grad_norm": 1.8211698532104492, + "learning_rate": 9.212805251802471e-06, + "loss": 0.3986, + "num_input_tokens_seen": 5823432, + "step": 8920 + }, + { + "epoch": 5.262382075471698, + "grad_norm": 1.4225010871887207, + "learning_rate": 9.211419028176273e-06, + "loss": 0.4713, + "num_input_tokens_seen": 5826568, + "step": 8925 + }, + { + "epoch": 5.265330188679245, + "grad_norm": 2.2613701820373535, + "learning_rate": 9.210031689558738e-06, + "loss": 0.2991, + "num_input_tokens_seen": 5829256, + "step": 8930 + }, + { + "epoch": 5.2682783018867925, + "grad_norm": 1.9945213794708252, + "learning_rate": 9.208643236317166e-06, + "loss": 0.3871, + "num_input_tokens_seen": 5833160, + "step": 8935 + }, + { + "epoch": 5.27122641509434, + "grad_norm": 1.8355199098587036, + "learning_rate": 9.20725366881916e-06, + "loss": 0.4402, + "num_input_tokens_seen": 5838632, + "step": 8940 + }, + { + "epoch": 5.274174528301887, + "grad_norm": 1.651877999305725, + "learning_rate": 9.205862987432614e-06, + "loss": 0.3734, + "num_input_tokens_seen": 5841416, + "step": 8945 + }, + { + "epoch": 5.277122641509434, + "grad_norm": 2.063458204269409, + "learning_rate": 9.204471192525715e-06, + "loss": 0.4547, + "num_input_tokens_seen": 5844296, + "step": 8950 + }, + { + "epoch": 5.280070754716981, + "grad_norm": 2.0544967651367188, + "learning_rate": 9.203078284466949e-06, + "loss": 0.5152, + "num_input_tokens_seen": 5847944, + "step": 8955 + }, + { + "epoch": 5.283018867924528, + "grad_norm": 1.8793351650238037, + "learning_rate": 9.201684263625091e-06, + "loss": 0.4842, + "num_input_tokens_seen": 5850312, + "step": 8960 + }, + { + "epoch": 5.285966981132075, + "grad_norm": 1.6601569652557373, + "learning_rate": 9.200289130369218e-06, + "loss": 0.4647, + "num_input_tokens_seen": 5853256, + "step": 8965 + }, + { + "epoch": 5.288915094339623, + "grad_norm": 1.887472152709961, + "learning_rate": 9.198892885068693e-06, + "loss": 0.5796, + "num_input_tokens_seen": 5856360, + "step": 8970 + }, + { + "epoch": 5.29186320754717, + "grad_norm": 2.218308687210083, + "learning_rate": 9.197495528093182e-06, + "loss": 0.4164, + "num_input_tokens_seen": 5859304, + "step": 8975 + }, + { + "epoch": 5.294811320754717, + "grad_norm": 1.4595645666122437, + "learning_rate": 9.196097059812639e-06, + "loss": 0.456, + "num_input_tokens_seen": 5862760, + "step": 8980 + }, + { + "epoch": 5.2977594339622645, + "grad_norm": 1.2827430963516235, + "learning_rate": 9.194697480597316e-06, + "loss": 0.4149, + "num_input_tokens_seen": 5865928, + "step": 8985 + }, + { + "epoch": 5.300707547169811, + "grad_norm": 1.9678654670715332, + "learning_rate": 9.193296790817755e-06, + "loss": 0.4285, + "num_input_tokens_seen": 5868296, + "step": 8990 + }, + { + "epoch": 5.303655660377358, + "grad_norm": 1.339626669883728, + "learning_rate": 9.1918949908448e-06, + "loss": 0.3426, + "num_input_tokens_seen": 5871976, + "step": 8995 + }, + { + "epoch": 5.306603773584905, + "grad_norm": 1.176430106163025, + "learning_rate": 9.190492081049578e-06, + "loss": 0.3078, + "num_input_tokens_seen": 5875880, + "step": 9000 + }, + { + "epoch": 5.309551886792453, + "grad_norm": 1.0606156587600708, + "learning_rate": 9.189088061803517e-06, + "loss": 0.3029, + "num_input_tokens_seen": 5878696, + "step": 9005 + }, + { + "epoch": 5.3125, + "grad_norm": 2.1841821670532227, + "learning_rate": 9.187682933478337e-06, + "loss": 0.5204, + "num_input_tokens_seen": 5881640, + "step": 9010 + }, + { + "epoch": 5.315448113207547, + "grad_norm": 2.3424289226531982, + "learning_rate": 9.186276696446054e-06, + "loss": 0.4113, + "num_input_tokens_seen": 5884264, + "step": 9015 + }, + { + "epoch": 5.318396226415095, + "grad_norm": 1.3566397428512573, + "learning_rate": 9.184869351078974e-06, + "loss": 0.4451, + "num_input_tokens_seen": 5888072, + "step": 9020 + }, + { + "epoch": 5.321344339622642, + "grad_norm": 1.379604697227478, + "learning_rate": 9.183460897749697e-06, + "loss": 0.3669, + "num_input_tokens_seen": 5891688, + "step": 9025 + }, + { + "epoch": 5.324292452830189, + "grad_norm": 1.468673586845398, + "learning_rate": 9.182051336831117e-06, + "loss": 0.398, + "num_input_tokens_seen": 5894280, + "step": 9030 + }, + { + "epoch": 5.3272405660377355, + "grad_norm": 2.0117573738098145, + "learning_rate": 9.180640668696424e-06, + "loss": 0.3878, + "num_input_tokens_seen": 5897672, + "step": 9035 + }, + { + "epoch": 5.330188679245283, + "grad_norm": 2.4404358863830566, + "learning_rate": 9.179228893719094e-06, + "loss": 0.5118, + "num_input_tokens_seen": 5900328, + "step": 9040 + }, + { + "epoch": 5.33313679245283, + "grad_norm": 1.687803864479065, + "learning_rate": 9.177816012272904e-06, + "loss": 0.4041, + "num_input_tokens_seen": 5903016, + "step": 9045 + }, + { + "epoch": 5.336084905660377, + "grad_norm": 1.6854829788208008, + "learning_rate": 9.17640202473192e-06, + "loss": 0.589, + "num_input_tokens_seen": 5905864, + "step": 9050 + }, + { + "epoch": 5.339033018867925, + "grad_norm": 3.7648508548736572, + "learning_rate": 9.1749869314705e-06, + "loss": 0.4051, + "num_input_tokens_seen": 5908904, + "step": 9055 + }, + { + "epoch": 5.341981132075472, + "grad_norm": 3.0982625484466553, + "learning_rate": 9.173570732863295e-06, + "loss": 0.5738, + "num_input_tokens_seen": 5912552, + "step": 9060 + }, + { + "epoch": 5.344929245283019, + "grad_norm": 2.485663414001465, + "learning_rate": 9.172153429285254e-06, + "loss": 0.5138, + "num_input_tokens_seen": 5914888, + "step": 9065 + }, + { + "epoch": 5.347877358490566, + "grad_norm": 2.581028938293457, + "learning_rate": 9.17073502111161e-06, + "loss": 0.5569, + "num_input_tokens_seen": 5917480, + "step": 9070 + }, + { + "epoch": 5.350825471698113, + "grad_norm": 2.175549268722534, + "learning_rate": 9.169315508717895e-06, + "loss": 0.431, + "num_input_tokens_seen": 5919944, + "step": 9075 + }, + { + "epoch": 5.35377358490566, + "grad_norm": 2.520141124725342, + "learning_rate": 9.167894892479932e-06, + "loss": 0.4494, + "num_input_tokens_seen": 5923240, + "step": 9080 + }, + { + "epoch": 5.3567216981132075, + "grad_norm": 1.7953218221664429, + "learning_rate": 9.16647317277383e-06, + "loss": 0.4871, + "num_input_tokens_seen": 5926344, + "step": 9085 + }, + { + "epoch": 5.359669811320755, + "grad_norm": 3.2753944396972656, + "learning_rate": 9.165050349976002e-06, + "loss": 0.4199, + "num_input_tokens_seen": 5929224, + "step": 9090 + }, + { + "epoch": 5.362617924528302, + "grad_norm": 1.472706913948059, + "learning_rate": 9.16362642446314e-06, + "loss": 0.4641, + "num_input_tokens_seen": 5933096, + "step": 9095 + }, + { + "epoch": 5.365566037735849, + "grad_norm": 1.5615272521972656, + "learning_rate": 9.162201396612242e-06, + "loss": 0.449, + "num_input_tokens_seen": 5936712, + "step": 9100 + }, + { + "epoch": 5.368514150943396, + "grad_norm": 1.9115259647369385, + "learning_rate": 9.160775266800583e-06, + "loss": 0.6339, + "num_input_tokens_seen": 5939368, + "step": 9105 + }, + { + "epoch": 5.371462264150943, + "grad_norm": 1.7299535274505615, + "learning_rate": 9.159348035405742e-06, + "loss": 0.4358, + "num_input_tokens_seen": 5942408, + "step": 9110 + }, + { + "epoch": 5.37441037735849, + "grad_norm": 2.7527048587799072, + "learning_rate": 9.157919702805582e-06, + "loss": 0.4323, + "num_input_tokens_seen": 5945480, + "step": 9115 + }, + { + "epoch": 5.377358490566038, + "grad_norm": 1.146026611328125, + "learning_rate": 9.156490269378262e-06, + "loss": 0.3451, + "num_input_tokens_seen": 5948872, + "step": 9120 + }, + { + "epoch": 5.380306603773585, + "grad_norm": 2.4105098247528076, + "learning_rate": 9.15505973550223e-06, + "loss": 0.5244, + "num_input_tokens_seen": 5952328, + "step": 9125 + }, + { + "epoch": 5.383254716981132, + "grad_norm": 1.4499263763427734, + "learning_rate": 9.153628101556223e-06, + "loss": 0.4124, + "num_input_tokens_seen": 5955720, + "step": 9130 + }, + { + "epoch": 5.3862028301886795, + "grad_norm": 1.788260817527771, + "learning_rate": 9.152195367919277e-06, + "loss": 0.4264, + "num_input_tokens_seen": 5958376, + "step": 9135 + }, + { + "epoch": 5.389150943396227, + "grad_norm": 2.6777048110961914, + "learning_rate": 9.150761534970713e-06, + "loss": 0.3861, + "num_input_tokens_seen": 5961096, + "step": 9140 + }, + { + "epoch": 5.392099056603773, + "grad_norm": 2.493377208709717, + "learning_rate": 9.149326603090144e-06, + "loss": 0.4391, + "num_input_tokens_seen": 5963944, + "step": 9145 + }, + { + "epoch": 5.3950471698113205, + "grad_norm": 4.22123908996582, + "learning_rate": 9.147890572657471e-06, + "loss": 0.4189, + "num_input_tokens_seen": 5966696, + "step": 9150 + }, + { + "epoch": 5.397995283018868, + "grad_norm": 2.9517319202423096, + "learning_rate": 9.146453444052895e-06, + "loss": 0.6397, + "num_input_tokens_seen": 5969864, + "step": 9155 + }, + { + "epoch": 5.400943396226415, + "grad_norm": 1.664089322090149, + "learning_rate": 9.145015217656899e-06, + "loss": 0.5828, + "num_input_tokens_seen": 5972840, + "step": 9160 + }, + { + "epoch": 5.403891509433962, + "grad_norm": 1.492478370666504, + "learning_rate": 9.14357589385026e-06, + "loss": 0.5123, + "num_input_tokens_seen": 5975464, + "step": 9165 + }, + { + "epoch": 5.40683962264151, + "grad_norm": 1.6935193538665771, + "learning_rate": 9.142135473014046e-06, + "loss": 0.4051, + "num_input_tokens_seen": 5978792, + "step": 9170 + }, + { + "epoch": 5.409787735849057, + "grad_norm": 2.231369733810425, + "learning_rate": 9.140693955529614e-06, + "loss": 0.3788, + "num_input_tokens_seen": 5981672, + "step": 9175 + }, + { + "epoch": 5.412735849056604, + "grad_norm": 1.6467370986938477, + "learning_rate": 9.13925134177861e-06, + "loss": 0.2851, + "num_input_tokens_seen": 5987752, + "step": 9180 + }, + { + "epoch": 5.415683962264151, + "grad_norm": 1.6663157939910889, + "learning_rate": 9.137807632142977e-06, + "loss": 0.3544, + "num_input_tokens_seen": 5990408, + "step": 9185 + }, + { + "epoch": 5.418632075471698, + "grad_norm": 1.2874102592468262, + "learning_rate": 9.136362827004937e-06, + "loss": 0.4482, + "num_input_tokens_seen": 5993704, + "step": 9190 + }, + { + "epoch": 5.421580188679245, + "grad_norm": 2.3429033756256104, + "learning_rate": 9.134916926747015e-06, + "loss": 0.3738, + "num_input_tokens_seen": 5997416, + "step": 9195 + }, + { + "epoch": 5.4245283018867925, + "grad_norm": 2.629138708114624, + "learning_rate": 9.133469931752016e-06, + "loss": 0.5928, + "num_input_tokens_seen": 6000584, + "step": 9200 + }, + { + "epoch": 5.42747641509434, + "grad_norm": 1.18839430809021, + "learning_rate": 9.132021842403035e-06, + "loss": 0.3036, + "num_input_tokens_seen": 6003080, + "step": 9205 + }, + { + "epoch": 5.430424528301887, + "grad_norm": 3.4050393104553223, + "learning_rate": 9.130572659083465e-06, + "loss": 0.4712, + "num_input_tokens_seen": 6006472, + "step": 9210 + }, + { + "epoch": 5.433372641509434, + "grad_norm": 3.0528812408447266, + "learning_rate": 9.129122382176982e-06, + "loss": 0.4492, + "num_input_tokens_seen": 6010440, + "step": 9215 + }, + { + "epoch": 5.436320754716981, + "grad_norm": 1.8928977251052856, + "learning_rate": 9.127671012067554e-06, + "loss": 0.4778, + "num_input_tokens_seen": 6013256, + "step": 9220 + }, + { + "epoch": 5.439268867924528, + "grad_norm": 1.5489469766616821, + "learning_rate": 9.126218549139434e-06, + "loss": 0.4007, + "num_input_tokens_seen": 6017192, + "step": 9225 + }, + { + "epoch": 5.442216981132075, + "grad_norm": 1.6457371711730957, + "learning_rate": 9.124764993777171e-06, + "loss": 0.3953, + "num_input_tokens_seen": 6020264, + "step": 9230 + }, + { + "epoch": 5.445165094339623, + "grad_norm": 2.0792150497436523, + "learning_rate": 9.1233103463656e-06, + "loss": 0.4557, + "num_input_tokens_seen": 6023752, + "step": 9235 + }, + { + "epoch": 5.44811320754717, + "grad_norm": 3.946282148361206, + "learning_rate": 9.121854607289842e-06, + "loss": 0.3912, + "num_input_tokens_seen": 6028328, + "step": 9240 + }, + { + "epoch": 5.451061320754717, + "grad_norm": 2.694544792175293, + "learning_rate": 9.120397776935314e-06, + "loss": 0.5873, + "num_input_tokens_seen": 6031272, + "step": 9245 + }, + { + "epoch": 5.4540094339622645, + "grad_norm": 5.984225749969482, + "learning_rate": 9.118939855687717e-06, + "loss": 0.3539, + "num_input_tokens_seen": 6034472, + "step": 9250 + }, + { + "epoch": 5.456957547169811, + "grad_norm": 2.3550405502319336, + "learning_rate": 9.117480843933043e-06, + "loss": 0.4471, + "num_input_tokens_seen": 6038760, + "step": 9255 + }, + { + "epoch": 5.459905660377358, + "grad_norm": 2.8562934398651123, + "learning_rate": 9.116020742057567e-06, + "loss": 0.4641, + "num_input_tokens_seen": 6042312, + "step": 9260 + }, + { + "epoch": 5.462853773584905, + "grad_norm": 2.145179271697998, + "learning_rate": 9.114559550447863e-06, + "loss": 0.5925, + "num_input_tokens_seen": 6044616, + "step": 9265 + }, + { + "epoch": 5.465801886792453, + "grad_norm": 2.3533828258514404, + "learning_rate": 9.113097269490784e-06, + "loss": 0.5181, + "num_input_tokens_seen": 6048424, + "step": 9270 + }, + { + "epoch": 5.46875, + "grad_norm": 1.1036087274551392, + "learning_rate": 9.111633899573476e-06, + "loss": 0.4099, + "num_input_tokens_seen": 6051016, + "step": 9275 + }, + { + "epoch": 5.471698113207547, + "grad_norm": 2.8446006774902344, + "learning_rate": 9.110169441083374e-06, + "loss": 0.5384, + "num_input_tokens_seen": 6054280, + "step": 9280 + }, + { + "epoch": 5.474646226415095, + "grad_norm": 3.961167812347412, + "learning_rate": 9.108703894408198e-06, + "loss": 0.3662, + "num_input_tokens_seen": 6057128, + "step": 9285 + }, + { + "epoch": 5.477594339622642, + "grad_norm": 3.038400650024414, + "learning_rate": 9.107237259935959e-06, + "loss": 0.4008, + "num_input_tokens_seen": 6060360, + "step": 9290 + }, + { + "epoch": 5.480542452830189, + "grad_norm": 3.3277804851531982, + "learning_rate": 9.105769538054954e-06, + "loss": 0.4012, + "num_input_tokens_seen": 6063976, + "step": 9295 + }, + { + "epoch": 5.4834905660377355, + "grad_norm": 1.2873364686965942, + "learning_rate": 9.104300729153768e-06, + "loss": 0.4109, + "num_input_tokens_seen": 6068488, + "step": 9300 + }, + { + "epoch": 5.486438679245283, + "grad_norm": 1.5659147500991821, + "learning_rate": 9.102830833621277e-06, + "loss": 0.4521, + "num_input_tokens_seen": 6071944, + "step": 9305 + }, + { + "epoch": 5.48938679245283, + "grad_norm": 1.8612449169158936, + "learning_rate": 9.101359851846639e-06, + "loss": 0.4442, + "num_input_tokens_seen": 6075144, + "step": 9310 + }, + { + "epoch": 5.492334905660377, + "grad_norm": 3.327059745788574, + "learning_rate": 9.099887784219305e-06, + "loss": 0.5017, + "num_input_tokens_seen": 6078952, + "step": 9315 + }, + { + "epoch": 5.495283018867925, + "grad_norm": 1.9221395254135132, + "learning_rate": 9.098414631129012e-06, + "loss": 0.3857, + "num_input_tokens_seen": 6081736, + "step": 9320 + }, + { + "epoch": 5.498231132075472, + "grad_norm": 1.7748950719833374, + "learning_rate": 9.09694039296578e-06, + "loss": 0.3306, + "num_input_tokens_seen": 6085128, + "step": 9325 + }, + { + "epoch": 5.501179245283019, + "grad_norm": 2.848676919937134, + "learning_rate": 9.095465070119924e-06, + "loss": 0.5741, + "num_input_tokens_seen": 6088680, + "step": 9330 + }, + { + "epoch": 5.504127358490566, + "grad_norm": 2.822033166885376, + "learning_rate": 9.09398866298204e-06, + "loss": 0.6311, + "num_input_tokens_seen": 6091880, + "step": 9335 + }, + { + "epoch": 5.507075471698113, + "grad_norm": 1.184868574142456, + "learning_rate": 9.09251117194301e-06, + "loss": 0.424, + "num_input_tokens_seen": 6095752, + "step": 9340 + }, + { + "epoch": 5.51002358490566, + "grad_norm": 1.5121214389801025, + "learning_rate": 9.091032597394012e-06, + "loss": 0.4407, + "num_input_tokens_seen": 6099176, + "step": 9345 + }, + { + "epoch": 5.5129716981132075, + "grad_norm": 2.7798593044281006, + "learning_rate": 9.089552939726503e-06, + "loss": 0.4979, + "num_input_tokens_seen": 6102152, + "step": 9350 + }, + { + "epoch": 5.515919811320755, + "grad_norm": 1.714670181274414, + "learning_rate": 9.088072199332227e-06, + "loss": 0.5228, + "num_input_tokens_seen": 6105256, + "step": 9355 + }, + { + "epoch": 5.518867924528302, + "grad_norm": 2.2994697093963623, + "learning_rate": 9.08659037660322e-06, + "loss": 0.4303, + "num_input_tokens_seen": 6108488, + "step": 9360 + }, + { + "epoch": 5.521816037735849, + "grad_norm": 1.9496381282806396, + "learning_rate": 9.085107471931797e-06, + "loss": 0.385, + "num_input_tokens_seen": 6113256, + "step": 9365 + }, + { + "epoch": 5.524764150943396, + "grad_norm": 2.571505546569824, + "learning_rate": 9.083623485710564e-06, + "loss": 0.4151, + "num_input_tokens_seen": 6115848, + "step": 9370 + }, + { + "epoch": 5.527712264150943, + "grad_norm": 1.8417683839797974, + "learning_rate": 9.082138418332416e-06, + "loss": 0.4596, + "num_input_tokens_seen": 6118216, + "step": 9375 + }, + { + "epoch": 5.53066037735849, + "grad_norm": 2.3361246585845947, + "learning_rate": 9.080652270190527e-06, + "loss": 0.3994, + "num_input_tokens_seen": 6120520, + "step": 9380 + }, + { + "epoch": 5.533608490566038, + "grad_norm": 2.540174961090088, + "learning_rate": 9.079165041678363e-06, + "loss": 0.3377, + "num_input_tokens_seen": 6123880, + "step": 9385 + }, + { + "epoch": 5.536556603773585, + "grad_norm": 1.8295546770095825, + "learning_rate": 9.077676733189675e-06, + "loss": 0.4339, + "num_input_tokens_seen": 6127528, + "step": 9390 + }, + { + "epoch": 5.539504716981132, + "grad_norm": 1.9093514680862427, + "learning_rate": 9.076187345118496e-06, + "loss": 0.3373, + "num_input_tokens_seen": 6129864, + "step": 9395 + }, + { + "epoch": 5.5424528301886795, + "grad_norm": 2.539487361907959, + "learning_rate": 9.074696877859152e-06, + "loss": 0.5797, + "num_input_tokens_seen": 6133704, + "step": 9400 + }, + { + "epoch": 5.545400943396227, + "grad_norm": 0.9788433909416199, + "learning_rate": 9.073205331806248e-06, + "loss": 0.3739, + "num_input_tokens_seen": 6136968, + "step": 9405 + }, + { + "epoch": 5.548349056603773, + "grad_norm": 3.93035626411438, + "learning_rate": 9.071712707354676e-06, + "loss": 0.3247, + "num_input_tokens_seen": 6139880, + "step": 9410 + }, + { + "epoch": 5.5512971698113205, + "grad_norm": 1.3705909252166748, + "learning_rate": 9.070219004899618e-06, + "loss": 0.5753, + "num_input_tokens_seen": 6143112, + "step": 9415 + }, + { + "epoch": 5.554245283018868, + "grad_norm": 3.2309653759002686, + "learning_rate": 9.068724224836538e-06, + "loss": 0.3399, + "num_input_tokens_seen": 6146728, + "step": 9420 + }, + { + "epoch": 5.557193396226415, + "grad_norm": 3.318100690841675, + "learning_rate": 9.067228367561182e-06, + "loss": 0.3981, + "num_input_tokens_seen": 6149192, + "step": 9425 + }, + { + "epoch": 5.560141509433962, + "grad_norm": 2.2322585582733154, + "learning_rate": 9.06573143346959e-06, + "loss": 0.5887, + "num_input_tokens_seen": 6155560, + "step": 9430 + }, + { + "epoch": 5.56308962264151, + "grad_norm": 2.0942013263702393, + "learning_rate": 9.064233422958078e-06, + "loss": 0.4602, + "num_input_tokens_seen": 6159080, + "step": 9435 + }, + { + "epoch": 5.566037735849057, + "grad_norm": 2.2779343128204346, + "learning_rate": 9.062734336423248e-06, + "loss": 0.4605, + "num_input_tokens_seen": 6162632, + "step": 9440 + }, + { + "epoch": 5.568985849056604, + "grad_norm": 3.158295154571533, + "learning_rate": 9.061234174261998e-06, + "loss": 0.7057, + "num_input_tokens_seen": 6165928, + "step": 9445 + }, + { + "epoch": 5.571933962264151, + "grad_norm": 2.1458961963653564, + "learning_rate": 9.059732936871493e-06, + "loss": 0.459, + "num_input_tokens_seen": 6168680, + "step": 9450 + }, + { + "epoch": 5.574882075471698, + "grad_norm": 1.3610783815383911, + "learning_rate": 9.058230624649198e-06, + "loss": 0.5412, + "num_input_tokens_seen": 6171720, + "step": 9455 + }, + { + "epoch": 5.577830188679245, + "grad_norm": 1.3831926584243774, + "learning_rate": 9.056727237992856e-06, + "loss": 0.357, + "num_input_tokens_seen": 6175688, + "step": 9460 + }, + { + "epoch": 5.5807783018867925, + "grad_norm": 2.4988064765930176, + "learning_rate": 9.055222777300493e-06, + "loss": 0.4673, + "num_input_tokens_seen": 6178984, + "step": 9465 + }, + { + "epoch": 5.58372641509434, + "grad_norm": 1.4757087230682373, + "learning_rate": 9.053717242970423e-06, + "loss": 0.4402, + "num_input_tokens_seen": 6182184, + "step": 9470 + }, + { + "epoch": 5.586674528301887, + "grad_norm": 1.7486714124679565, + "learning_rate": 9.052210635401244e-06, + "loss": 0.7418, + "num_input_tokens_seen": 6184776, + "step": 9475 + }, + { + "epoch": 5.589622641509434, + "grad_norm": 2.719128131866455, + "learning_rate": 9.050702954991833e-06, + "loss": 0.4825, + "num_input_tokens_seen": 6187528, + "step": 9480 + }, + { + "epoch": 5.592570754716981, + "grad_norm": 2.127504348754883, + "learning_rate": 9.049194202141358e-06, + "loss": 0.448, + "num_input_tokens_seen": 6190472, + "step": 9485 + }, + { + "epoch": 5.595518867924528, + "grad_norm": 2.7241947650909424, + "learning_rate": 9.047684377249267e-06, + "loss": 0.453, + "num_input_tokens_seen": 6193192, + "step": 9490 + }, + { + "epoch": 5.598466981132075, + "grad_norm": 1.6014137268066406, + "learning_rate": 9.046173480715292e-06, + "loss": 0.5519, + "num_input_tokens_seen": 6196200, + "step": 9495 + }, + { + "epoch": 5.601415094339623, + "grad_norm": 1.880471110343933, + "learning_rate": 9.044661512939451e-06, + "loss": 0.3322, + "num_input_tokens_seen": 6198856, + "step": 9500 + }, + { + "epoch": 5.60436320754717, + "grad_norm": 1.6292113065719604, + "learning_rate": 9.043148474322043e-06, + "loss": 0.6234, + "num_input_tokens_seen": 6202024, + "step": 9505 + }, + { + "epoch": 5.607311320754717, + "grad_norm": 1.6045300960540771, + "learning_rate": 9.04163436526365e-06, + "loss": 0.5384, + "num_input_tokens_seen": 6205352, + "step": 9510 + }, + { + "epoch": 5.6102594339622645, + "grad_norm": 1.8734464645385742, + "learning_rate": 9.040119186165142e-06, + "loss": 0.4535, + "num_input_tokens_seen": 6207848, + "step": 9515 + }, + { + "epoch": 5.613207547169811, + "grad_norm": 1.3760719299316406, + "learning_rate": 9.038602937427665e-06, + "loss": 0.3904, + "num_input_tokens_seen": 6210920, + "step": 9520 + }, + { + "epoch": 5.616155660377358, + "grad_norm": 1.415008783340454, + "learning_rate": 9.037085619452658e-06, + "loss": 0.6985, + "num_input_tokens_seen": 6215432, + "step": 9525 + }, + { + "epoch": 5.619103773584905, + "grad_norm": 3.0020735263824463, + "learning_rate": 9.035567232641833e-06, + "loss": 0.3224, + "num_input_tokens_seen": 6218760, + "step": 9530 + }, + { + "epoch": 5.622051886792453, + "grad_norm": 1.5325114727020264, + "learning_rate": 9.03404777739719e-06, + "loss": 0.4155, + "num_input_tokens_seen": 6222952, + "step": 9535 + }, + { + "epoch": 5.625, + "grad_norm": 0.9342206120491028, + "learning_rate": 9.032527254121013e-06, + "loss": 0.4727, + "num_input_tokens_seen": 6226376, + "step": 9540 + }, + { + "epoch": 5.627948113207547, + "grad_norm": 0.868590772151947, + "learning_rate": 9.031005663215867e-06, + "loss": 0.3267, + "num_input_tokens_seen": 6229448, + "step": 9545 + }, + { + "epoch": 5.630896226415095, + "grad_norm": 2.573518753051758, + "learning_rate": 9.029483005084595e-06, + "loss": 0.4122, + "num_input_tokens_seen": 6233224, + "step": 9550 + }, + { + "epoch": 5.633844339622642, + "grad_norm": 1.670257806777954, + "learning_rate": 9.027959280130337e-06, + "loss": 0.4086, + "num_input_tokens_seen": 6236392, + "step": 9555 + }, + { + "epoch": 5.636792452830189, + "grad_norm": 1.5887823104858398, + "learning_rate": 9.026434488756496e-06, + "loss": 0.469, + "num_input_tokens_seen": 6239112, + "step": 9560 + }, + { + "epoch": 5.6397405660377355, + "grad_norm": 1.4977518320083618, + "learning_rate": 9.024908631366774e-06, + "loss": 0.518, + "num_input_tokens_seen": 6242568, + "step": 9565 + }, + { + "epoch": 5.642688679245283, + "grad_norm": 2.1194980144500732, + "learning_rate": 9.023381708365143e-06, + "loss": 0.406, + "num_input_tokens_seen": 6245640, + "step": 9570 + }, + { + "epoch": 5.64563679245283, + "grad_norm": 2.9444210529327393, + "learning_rate": 9.021853720155866e-06, + "loss": 0.3532, + "num_input_tokens_seen": 6248744, + "step": 9575 + }, + { + "epoch": 5.648584905660377, + "grad_norm": 2.082366943359375, + "learning_rate": 9.020324667143483e-06, + "loss": 0.5186, + "num_input_tokens_seen": 6251624, + "step": 9580 + }, + { + "epoch": 5.651533018867925, + "grad_norm": 3.273745059967041, + "learning_rate": 9.018794549732819e-06, + "loss": 0.367, + "num_input_tokens_seen": 6254664, + "step": 9585 + }, + { + "epoch": 5.654481132075472, + "grad_norm": 1.9303058385849, + "learning_rate": 9.017263368328977e-06, + "loss": 0.4848, + "num_input_tokens_seen": 6257928, + "step": 9590 + }, + { + "epoch": 5.657429245283019, + "grad_norm": 1.6684821844100952, + "learning_rate": 9.015731123337344e-06, + "loss": 0.4894, + "num_input_tokens_seen": 6261416, + "step": 9595 + }, + { + "epoch": 5.660377358490566, + "grad_norm": 1.4447792768478394, + "learning_rate": 9.01419781516359e-06, + "loss": 0.3968, + "num_input_tokens_seen": 6264520, + "step": 9600 + }, + { + "epoch": 5.663325471698113, + "grad_norm": 2.670275926589966, + "learning_rate": 9.012663444213664e-06, + "loss": 0.4933, + "num_input_tokens_seen": 6267944, + "step": 9605 + }, + { + "epoch": 5.66627358490566, + "grad_norm": 2.5596470832824707, + "learning_rate": 9.011128010893797e-06, + "loss": 0.4628, + "num_input_tokens_seen": 6271208, + "step": 9610 + }, + { + "epoch": 5.6692216981132075, + "grad_norm": 1.652653455734253, + "learning_rate": 9.009591515610503e-06, + "loss": 0.4661, + "num_input_tokens_seen": 6274312, + "step": 9615 + }, + { + "epoch": 5.672169811320755, + "grad_norm": 3.2180609703063965, + "learning_rate": 9.008053958770575e-06, + "loss": 0.4437, + "num_input_tokens_seen": 6277256, + "step": 9620 + }, + { + "epoch": 5.675117924528302, + "grad_norm": 2.7684619426727295, + "learning_rate": 9.006515340781087e-06, + "loss": 0.4205, + "num_input_tokens_seen": 6280712, + "step": 9625 + }, + { + "epoch": 5.678066037735849, + "grad_norm": 1.142978310585022, + "learning_rate": 9.004975662049396e-06, + "loss": 0.3217, + "num_input_tokens_seen": 6283816, + "step": 9630 + }, + { + "epoch": 5.681014150943396, + "grad_norm": 3.529292106628418, + "learning_rate": 9.003434922983138e-06, + "loss": 0.4349, + "num_input_tokens_seen": 6286248, + "step": 9635 + }, + { + "epoch": 5.683962264150943, + "grad_norm": 1.9229927062988281, + "learning_rate": 9.00189312399023e-06, + "loss": 0.4269, + "num_input_tokens_seen": 6289096, + "step": 9640 + }, + { + "epoch": 5.68691037735849, + "grad_norm": 1.792043924331665, + "learning_rate": 9.00035026547887e-06, + "loss": 0.3732, + "num_input_tokens_seen": 6292392, + "step": 9645 + }, + { + "epoch": 5.689858490566038, + "grad_norm": 3.2065787315368652, + "learning_rate": 8.998806347857537e-06, + "loss": 0.469, + "num_input_tokens_seen": 6295304, + "step": 9650 + }, + { + "epoch": 5.692806603773585, + "grad_norm": 1.8897875547409058, + "learning_rate": 8.99726137153499e-06, + "loss": 0.4393, + "num_input_tokens_seen": 6298088, + "step": 9655 + }, + { + "epoch": 5.695754716981132, + "grad_norm": 2.4694504737854004, + "learning_rate": 8.995715336920266e-06, + "loss": 0.4891, + "num_input_tokens_seen": 6301384, + "step": 9660 + }, + { + "epoch": 5.6987028301886795, + "grad_norm": 2.761639356613159, + "learning_rate": 8.994168244422687e-06, + "loss": 0.5764, + "num_input_tokens_seen": 6304392, + "step": 9665 + }, + { + "epoch": 5.701650943396227, + "grad_norm": 1.1657084226608276, + "learning_rate": 8.992620094451852e-06, + "loss": 0.378, + "num_input_tokens_seen": 6308552, + "step": 9670 + }, + { + "epoch": 5.704599056603773, + "grad_norm": 3.3550541400909424, + "learning_rate": 8.991070887417639e-06, + "loss": 0.4368, + "num_input_tokens_seen": 6311368, + "step": 9675 + }, + { + "epoch": 5.7075471698113205, + "grad_norm": 2.946260452270508, + "learning_rate": 8.989520623730208e-06, + "loss": 0.4913, + "num_input_tokens_seen": 6315112, + "step": 9680 + }, + { + "epoch": 5.710495283018868, + "grad_norm": 1.4334410429000854, + "learning_rate": 8.987969303799998e-06, + "loss": 0.5032, + "num_input_tokens_seen": 6319176, + "step": 9685 + }, + { + "epoch": 5.713443396226415, + "grad_norm": 3.05387282371521, + "learning_rate": 8.986416928037728e-06, + "loss": 0.4111, + "num_input_tokens_seen": 6321832, + "step": 9690 + }, + { + "epoch": 5.716391509433962, + "grad_norm": 1.658649206161499, + "learning_rate": 8.984863496854395e-06, + "loss": 0.4271, + "num_input_tokens_seen": 6324648, + "step": 9695 + }, + { + "epoch": 5.71933962264151, + "grad_norm": 2.1617074012756348, + "learning_rate": 8.983309010661279e-06, + "loss": 0.4236, + "num_input_tokens_seen": 6327304, + "step": 9700 + }, + { + "epoch": 5.722287735849057, + "grad_norm": 1.6698461771011353, + "learning_rate": 8.981753469869934e-06, + "loss": 0.6688, + "num_input_tokens_seen": 6330504, + "step": 9705 + }, + { + "epoch": 5.725235849056604, + "grad_norm": 2.268144369125366, + "learning_rate": 8.980196874892198e-06, + "loss": 0.4468, + "num_input_tokens_seen": 6333192, + "step": 9710 + }, + { + "epoch": 5.728183962264151, + "grad_norm": 1.8669620752334595, + "learning_rate": 8.978639226140184e-06, + "loss": 0.3965, + "num_input_tokens_seen": 6336488, + "step": 9715 + }, + { + "epoch": 5.731132075471698, + "grad_norm": 2.112658739089966, + "learning_rate": 8.977080524026289e-06, + "loss": 0.3408, + "num_input_tokens_seen": 6340296, + "step": 9720 + }, + { + "epoch": 5.734080188679245, + "grad_norm": 1.7725245952606201, + "learning_rate": 8.975520768963186e-06, + "loss": 0.5933, + "num_input_tokens_seen": 6342664, + "step": 9725 + }, + { + "epoch": 5.7370283018867925, + "grad_norm": 2.735645055770874, + "learning_rate": 8.973959961363825e-06, + "loss": 0.4087, + "num_input_tokens_seen": 6345384, + "step": 9730 + }, + { + "epoch": 5.73997641509434, + "grad_norm": 2.775350332260132, + "learning_rate": 8.972398101641438e-06, + "loss": 0.4258, + "num_input_tokens_seen": 6348104, + "step": 9735 + }, + { + "epoch": 5.742924528301887, + "grad_norm": 2.370429039001465, + "learning_rate": 8.970835190209532e-06, + "loss": 0.4466, + "num_input_tokens_seen": 6355592, + "step": 9740 + }, + { + "epoch": 5.745872641509434, + "grad_norm": 4.120883464813232, + "learning_rate": 8.969271227481899e-06, + "loss": 0.5416, + "num_input_tokens_seen": 6359144, + "step": 9745 + }, + { + "epoch": 5.748820754716981, + "grad_norm": 2.135887384414673, + "learning_rate": 8.967706213872599e-06, + "loss": 0.4149, + "num_input_tokens_seen": 6362696, + "step": 9750 + }, + { + "epoch": 5.751768867924528, + "grad_norm": 1.5834332704544067, + "learning_rate": 8.966140149795981e-06, + "loss": 0.4449, + "num_input_tokens_seen": 6366120, + "step": 9755 + }, + { + "epoch": 5.754716981132075, + "grad_norm": 1.0197964906692505, + "learning_rate": 8.964573035666663e-06, + "loss": 0.3594, + "num_input_tokens_seen": 6369224, + "step": 9760 + }, + { + "epoch": 5.757665094339623, + "grad_norm": 3.677851676940918, + "learning_rate": 8.96300487189955e-06, + "loss": 0.4247, + "num_input_tokens_seen": 6372328, + "step": 9765 + }, + { + "epoch": 5.76061320754717, + "grad_norm": 2.2238149642944336, + "learning_rate": 8.961435658909816e-06, + "loss": 0.5206, + "num_input_tokens_seen": 6375368, + "step": 9770 + }, + { + "epoch": 5.763561320754717, + "grad_norm": 1.830399990081787, + "learning_rate": 8.959865397112918e-06, + "loss": 0.4524, + "num_input_tokens_seen": 6378440, + "step": 9775 + }, + { + "epoch": 5.7665094339622645, + "grad_norm": 2.8615005016326904, + "learning_rate": 8.95829408692459e-06, + "loss": 0.4644, + "num_input_tokens_seen": 6381096, + "step": 9780 + }, + { + "epoch": 5.769457547169811, + "grad_norm": 2.100790500640869, + "learning_rate": 8.956721728760845e-06, + "loss": 0.3863, + "num_input_tokens_seen": 6384328, + "step": 9785 + }, + { + "epoch": 5.772405660377358, + "grad_norm": 2.152247667312622, + "learning_rate": 8.95514832303797e-06, + "loss": 0.5591, + "num_input_tokens_seen": 6388232, + "step": 9790 + }, + { + "epoch": 5.775353773584905, + "grad_norm": 3.9218504428863525, + "learning_rate": 8.953573870172528e-06, + "loss": 0.3714, + "num_input_tokens_seen": 6390952, + "step": 9795 + }, + { + "epoch": 5.778301886792453, + "grad_norm": 3.4655866622924805, + "learning_rate": 8.951998370581368e-06, + "loss": 0.3962, + "num_input_tokens_seen": 6394472, + "step": 9800 + }, + { + "epoch": 5.78125, + "grad_norm": 1.9657557010650635, + "learning_rate": 8.950421824681605e-06, + "loss": 0.5625, + "num_input_tokens_seen": 6398312, + "step": 9805 + }, + { + "epoch": 5.784198113207547, + "grad_norm": 2.6667251586914062, + "learning_rate": 8.948844232890638e-06, + "loss": 0.4149, + "num_input_tokens_seen": 6401768, + "step": 9810 + }, + { + "epoch": 5.787146226415095, + "grad_norm": 1.686158537864685, + "learning_rate": 8.947265595626144e-06, + "loss": 0.5064, + "num_input_tokens_seen": 6404872, + "step": 9815 + }, + { + "epoch": 5.790094339622642, + "grad_norm": 2.5049242973327637, + "learning_rate": 8.945685913306071e-06, + "loss": 0.4599, + "num_input_tokens_seen": 6408296, + "step": 9820 + }, + { + "epoch": 5.793042452830189, + "grad_norm": 2.334519386291504, + "learning_rate": 8.944105186348646e-06, + "loss": 0.3621, + "num_input_tokens_seen": 6411464, + "step": 9825 + }, + { + "epoch": 5.7959905660377355, + "grad_norm": 1.850448489189148, + "learning_rate": 8.942523415172377e-06, + "loss": 0.4174, + "num_input_tokens_seen": 6414408, + "step": 9830 + }, + { + "epoch": 5.798938679245283, + "grad_norm": 2.590169668197632, + "learning_rate": 8.94094060019604e-06, + "loss": 0.3615, + "num_input_tokens_seen": 6417448, + "step": 9835 + }, + { + "epoch": 5.80188679245283, + "grad_norm": 2.3526647090911865, + "learning_rate": 8.939356741838696e-06, + "loss": 0.4851, + "num_input_tokens_seen": 6420936, + "step": 9840 + }, + { + "epoch": 5.804834905660377, + "grad_norm": 2.5362000465393066, + "learning_rate": 8.937771840519677e-06, + "loss": 0.3226, + "num_input_tokens_seen": 6424264, + "step": 9845 + }, + { + "epoch": 5.807783018867925, + "grad_norm": 1.8015542030334473, + "learning_rate": 8.936185896658593e-06, + "loss": 0.411, + "num_input_tokens_seen": 6426920, + "step": 9850 + }, + { + "epoch": 5.810731132075472, + "grad_norm": 2.584254741668701, + "learning_rate": 8.934598910675329e-06, + "loss": 0.3852, + "num_input_tokens_seen": 6430216, + "step": 9855 + }, + { + "epoch": 5.813679245283019, + "grad_norm": 1.8372210264205933, + "learning_rate": 8.933010882990044e-06, + "loss": 0.4039, + "num_input_tokens_seen": 6434088, + "step": 9860 + }, + { + "epoch": 5.816627358490566, + "grad_norm": 1.3932268619537354, + "learning_rate": 8.93142181402318e-06, + "loss": 0.367, + "num_input_tokens_seen": 6437640, + "step": 9865 + }, + { + "epoch": 5.819575471698113, + "grad_norm": 2.291452646255493, + "learning_rate": 8.929831704195445e-06, + "loss": 0.4027, + "num_input_tokens_seen": 6440744, + "step": 9870 + }, + { + "epoch": 5.82252358490566, + "grad_norm": 2.886775493621826, + "learning_rate": 8.928240553927831e-06, + "loss": 0.3891, + "num_input_tokens_seen": 6443208, + "step": 9875 + }, + { + "epoch": 5.8254716981132075, + "grad_norm": 2.408482551574707, + "learning_rate": 8.926648363641602e-06, + "loss": 0.3448, + "num_input_tokens_seen": 6446312, + "step": 9880 + }, + { + "epoch": 5.828419811320755, + "grad_norm": 2.5916571617126465, + "learning_rate": 8.925055133758294e-06, + "loss": 0.5208, + "num_input_tokens_seen": 6449640, + "step": 9885 + }, + { + "epoch": 5.831367924528302, + "grad_norm": 2.447073221206665, + "learning_rate": 8.923460864699723e-06, + "loss": 0.4145, + "num_input_tokens_seen": 6453384, + "step": 9890 + }, + { + "epoch": 5.834316037735849, + "grad_norm": 1.3931211233139038, + "learning_rate": 8.921865556887979e-06, + "loss": 0.5891, + "num_input_tokens_seen": 6455880, + "step": 9895 + }, + { + "epoch": 5.837264150943396, + "grad_norm": 1.6746543645858765, + "learning_rate": 8.920269210745426e-06, + "loss": 0.4462, + "num_input_tokens_seen": 6458984, + "step": 9900 + }, + { + "epoch": 5.840212264150943, + "grad_norm": 1.170090675354004, + "learning_rate": 8.918671826694704e-06, + "loss": 0.3977, + "num_input_tokens_seen": 6462344, + "step": 9905 + }, + { + "epoch": 5.84316037735849, + "grad_norm": 1.2894397974014282, + "learning_rate": 8.91707340515873e-06, + "loss": 0.4414, + "num_input_tokens_seen": 6465288, + "step": 9910 + }, + { + "epoch": 5.846108490566038, + "grad_norm": 1.420104742050171, + "learning_rate": 8.915473946560688e-06, + "loss": 0.4247, + "num_input_tokens_seen": 6470728, + "step": 9915 + }, + { + "epoch": 5.849056603773585, + "grad_norm": 4.945779323577881, + "learning_rate": 8.913873451324044e-06, + "loss": 0.5315, + "num_input_tokens_seen": 6474376, + "step": 9920 + }, + { + "epoch": 5.852004716981132, + "grad_norm": 1.4195010662078857, + "learning_rate": 8.912271919872538e-06, + "loss": 0.3625, + "num_input_tokens_seen": 6477672, + "step": 9925 + }, + { + "epoch": 5.8549528301886795, + "grad_norm": 2.7578988075256348, + "learning_rate": 8.910669352630176e-06, + "loss": 0.5733, + "num_input_tokens_seen": 6480136, + "step": 9930 + }, + { + "epoch": 5.857900943396227, + "grad_norm": 1.6564258337020874, + "learning_rate": 8.909065750021253e-06, + "loss": 0.3553, + "num_input_tokens_seen": 6483240, + "step": 9935 + }, + { + "epoch": 5.860849056603773, + "grad_norm": 3.189434289932251, + "learning_rate": 8.907461112470323e-06, + "loss": 0.5216, + "num_input_tokens_seen": 6486600, + "step": 9940 + }, + { + "epoch": 5.8637971698113205, + "grad_norm": 1.4162263870239258, + "learning_rate": 8.905855440402225e-06, + "loss": 0.4096, + "num_input_tokens_seen": 6489128, + "step": 9945 + }, + { + "epoch": 5.866745283018868, + "grad_norm": 2.5725746154785156, + "learning_rate": 8.904248734242065e-06, + "loss": 0.5143, + "num_input_tokens_seen": 6491976, + "step": 9950 + }, + { + "epoch": 5.869693396226415, + "grad_norm": 1.9021679162979126, + "learning_rate": 8.902640994415226e-06, + "loss": 0.4387, + "num_input_tokens_seen": 6495464, + "step": 9955 + }, + { + "epoch": 5.872641509433962, + "grad_norm": 4.451745986938477, + "learning_rate": 8.901032221347364e-06, + "loss": 0.4629, + "num_input_tokens_seen": 6498344, + "step": 9960 + }, + { + "epoch": 5.87558962264151, + "grad_norm": 2.2210371494293213, + "learning_rate": 8.899422415464409e-06, + "loss": 0.4315, + "num_input_tokens_seen": 6501864, + "step": 9965 + }, + { + "epoch": 5.878537735849057, + "grad_norm": 1.5052103996276855, + "learning_rate": 8.897811577192565e-06, + "loss": 0.5162, + "num_input_tokens_seen": 6504776, + "step": 9970 + }, + { + "epoch": 5.881485849056604, + "grad_norm": 1.7460362911224365, + "learning_rate": 8.896199706958306e-06, + "loss": 0.3986, + "num_input_tokens_seen": 6507912, + "step": 9975 + }, + { + "epoch": 5.884433962264151, + "grad_norm": 4.087330341339111, + "learning_rate": 8.894586805188384e-06, + "loss": 0.4826, + "num_input_tokens_seen": 6510632, + "step": 9980 + }, + { + "epoch": 5.887382075471698, + "grad_norm": 1.4754745960235596, + "learning_rate": 8.892972872309821e-06, + "loss": 0.3284, + "num_input_tokens_seen": 6513640, + "step": 9985 + }, + { + "epoch": 5.890330188679245, + "grad_norm": 2.0166385173797607, + "learning_rate": 8.89135790874991e-06, + "loss": 0.4495, + "num_input_tokens_seen": 6517384, + "step": 9990 + }, + { + "epoch": 5.8932783018867925, + "grad_norm": 2.142604351043701, + "learning_rate": 8.889741914936224e-06, + "loss": 0.4596, + "num_input_tokens_seen": 6520680, + "step": 9995 + }, + { + "epoch": 5.89622641509434, + "grad_norm": 1.4938353300094604, + "learning_rate": 8.888124891296602e-06, + "loss": 0.3534, + "num_input_tokens_seen": 6524264, + "step": 10000 + }, + { + "epoch": 5.899174528301887, + "grad_norm": 2.2481894493103027, + "learning_rate": 8.886506838259156e-06, + "loss": 0.4953, + "num_input_tokens_seen": 6527624, + "step": 10005 + }, + { + "epoch": 5.902122641509434, + "grad_norm": 2.555413246154785, + "learning_rate": 8.884887756252279e-06, + "loss": 0.4272, + "num_input_tokens_seen": 6530184, + "step": 10010 + }, + { + "epoch": 5.905070754716981, + "grad_norm": 2.8460235595703125, + "learning_rate": 8.88326764570462e-06, + "loss": 0.3933, + "num_input_tokens_seen": 6534152, + "step": 10015 + }, + { + "epoch": 5.908018867924528, + "grad_norm": 1.5925590991973877, + "learning_rate": 8.88164650704512e-06, + "loss": 0.3367, + "num_input_tokens_seen": 6537544, + "step": 10020 + }, + { + "epoch": 5.910966981132075, + "grad_norm": 1.4225480556488037, + "learning_rate": 8.880024340702978e-06, + "loss": 0.5126, + "num_input_tokens_seen": 6540232, + "step": 10025 + }, + { + "epoch": 5.913915094339623, + "grad_norm": 2.6845648288726807, + "learning_rate": 8.878401147107667e-06, + "loss": 0.3383, + "num_input_tokens_seen": 6544328, + "step": 10030 + }, + { + "epoch": 5.91686320754717, + "grad_norm": 1.5596427917480469, + "learning_rate": 8.87677692668894e-06, + "loss": 0.442, + "num_input_tokens_seen": 6547624, + "step": 10035 + }, + { + "epoch": 5.919811320754717, + "grad_norm": 3.904777765274048, + "learning_rate": 8.875151679876813e-06, + "loss": 0.6714, + "num_input_tokens_seen": 6551080, + "step": 10040 + }, + { + "epoch": 5.9227594339622645, + "grad_norm": 2.436663866043091, + "learning_rate": 8.873525407101577e-06, + "loss": 0.4746, + "num_input_tokens_seen": 6554120, + "step": 10045 + }, + { + "epoch": 5.925707547169811, + "grad_norm": 2.0067825317382812, + "learning_rate": 8.871898108793796e-06, + "loss": 0.4872, + "num_input_tokens_seen": 6557000, + "step": 10050 + }, + { + "epoch": 5.928655660377358, + "grad_norm": 1.824528694152832, + "learning_rate": 8.870269785384304e-06, + "loss": 0.4556, + "num_input_tokens_seen": 6560328, + "step": 10055 + }, + { + "epoch": 5.931603773584905, + "grad_norm": 2.4293510913848877, + "learning_rate": 8.868640437304206e-06, + "loss": 0.4022, + "num_input_tokens_seen": 6563432, + "step": 10060 + }, + { + "epoch": 5.934551886792453, + "grad_norm": 1.5488672256469727, + "learning_rate": 8.86701006498488e-06, + "loss": 0.5331, + "num_input_tokens_seen": 6566472, + "step": 10065 + }, + { + "epoch": 5.9375, + "grad_norm": 1.7801010608673096, + "learning_rate": 8.865378668857972e-06, + "loss": 0.4415, + "num_input_tokens_seen": 6570376, + "step": 10070 + }, + { + "epoch": 5.940448113207547, + "grad_norm": 1.7903923988342285, + "learning_rate": 8.863746249355404e-06, + "loss": 0.6459, + "num_input_tokens_seen": 6572712, + "step": 10075 + }, + { + "epoch": 5.943396226415095, + "grad_norm": 4.813990592956543, + "learning_rate": 8.862112806909365e-06, + "loss": 0.5066, + "num_input_tokens_seen": 6575688, + "step": 10080 + }, + { + "epoch": 5.946344339622642, + "grad_norm": 1.9035300016403198, + "learning_rate": 8.860478341952314e-06, + "loss": 0.5235, + "num_input_tokens_seen": 6578696, + "step": 10085 + }, + { + "epoch": 5.949292452830189, + "grad_norm": 2.4319918155670166, + "learning_rate": 8.858842854916985e-06, + "loss": 0.3276, + "num_input_tokens_seen": 6581096, + "step": 10090 + }, + { + "epoch": 5.9522405660377355, + "grad_norm": 2.5436289310455322, + "learning_rate": 8.85720634623638e-06, + "loss": 0.4287, + "num_input_tokens_seen": 6584712, + "step": 10095 + }, + { + "epoch": 5.955188679245283, + "grad_norm": 2.1763691902160645, + "learning_rate": 8.855568816343769e-06, + "loss": 0.4793, + "num_input_tokens_seen": 6587624, + "step": 10100 + }, + { + "epoch": 5.95813679245283, + "grad_norm": 1.643932819366455, + "learning_rate": 8.8539302656727e-06, + "loss": 0.462, + "num_input_tokens_seen": 6590568, + "step": 10105 + }, + { + "epoch": 5.961084905660377, + "grad_norm": 1.4726020097732544, + "learning_rate": 8.852290694656983e-06, + "loss": 0.5579, + "num_input_tokens_seen": 6593832, + "step": 10110 + }, + { + "epoch": 5.964033018867925, + "grad_norm": 1.864038348197937, + "learning_rate": 8.8506501037307e-06, + "loss": 0.4579, + "num_input_tokens_seen": 6596488, + "step": 10115 + }, + { + "epoch": 5.966981132075472, + "grad_norm": 2.398531436920166, + "learning_rate": 8.849008493328209e-06, + "loss": 0.3538, + "num_input_tokens_seen": 6599080, + "step": 10120 + }, + { + "epoch": 5.969929245283019, + "grad_norm": 1.2692980766296387, + "learning_rate": 8.847365863884131e-06, + "loss": 0.4391, + "num_input_tokens_seen": 6602088, + "step": 10125 + }, + { + "epoch": 5.972877358490566, + "grad_norm": 1.5619392395019531, + "learning_rate": 8.845722215833359e-06, + "loss": 0.3039, + "num_input_tokens_seen": 6604648, + "step": 10130 + }, + { + "epoch": 5.975825471698113, + "grad_norm": 2.0308432579040527, + "learning_rate": 8.844077549611056e-06, + "loss": 0.3713, + "num_input_tokens_seen": 6608552, + "step": 10135 + }, + { + "epoch": 5.97877358490566, + "grad_norm": 1.2307082414627075, + "learning_rate": 8.842431865652654e-06, + "loss": 0.4916, + "num_input_tokens_seen": 6612072, + "step": 10140 + }, + { + "epoch": 5.9817216981132075, + "grad_norm": 1.4244396686553955, + "learning_rate": 8.840785164393858e-06, + "loss": 0.451, + "num_input_tokens_seen": 6614440, + "step": 10145 + }, + { + "epoch": 5.984669811320755, + "grad_norm": 2.163041830062866, + "learning_rate": 8.839137446270634e-06, + "loss": 0.4549, + "num_input_tokens_seen": 6618024, + "step": 10150 + }, + { + "epoch": 5.987617924528302, + "grad_norm": 1.280468463897705, + "learning_rate": 8.837488711719226e-06, + "loss": 0.4564, + "num_input_tokens_seen": 6621192, + "step": 10155 + }, + { + "epoch": 5.990566037735849, + "grad_norm": 1.685910940170288, + "learning_rate": 8.835838961176143e-06, + "loss": 0.4378, + "num_input_tokens_seen": 6623592, + "step": 10160 + }, + { + "epoch": 5.993514150943396, + "grad_norm": 1.6796917915344238, + "learning_rate": 8.834188195078164e-06, + "loss": 0.3693, + "num_input_tokens_seen": 6626504, + "step": 10165 + }, + { + "epoch": 5.996462264150943, + "grad_norm": 3.3206424713134766, + "learning_rate": 8.832536413862337e-06, + "loss": 0.3843, + "num_input_tokens_seen": 6629800, + "step": 10170 + }, + { + "epoch": 5.99941037735849, + "grad_norm": 4.527689456939697, + "learning_rate": 8.830883617965976e-06, + "loss": 0.5516, + "num_input_tokens_seen": 6632680, + "step": 10175 + }, + { + "epoch": 6.0, + "eval_loss": 0.5020108222961426, + "eval_runtime": 19.1833, + "eval_samples_per_second": 88.41, + "eval_steps_per_second": 22.103, + "num_input_tokens_seen": 6632800, + "step": 10176 + }, + { + "epoch": 6.002358490566038, + "grad_norm": 4.684268951416016, + "learning_rate": 8.829229807826665e-06, + "loss": 0.3397, + "num_input_tokens_seen": 6634976, + "step": 10180 + }, + { + "epoch": 6.005306603773585, + "grad_norm": 1.6458688974380493, + "learning_rate": 8.82757498388226e-06, + "loss": 0.3627, + "num_input_tokens_seen": 6637536, + "step": 10185 + }, + { + "epoch": 6.008254716981132, + "grad_norm": 1.398048996925354, + "learning_rate": 8.825919146570884e-06, + "loss": 0.3804, + "num_input_tokens_seen": 6641152, + "step": 10190 + }, + { + "epoch": 6.0112028301886795, + "grad_norm": 1.6877312660217285, + "learning_rate": 8.824262296330925e-06, + "loss": 0.4702, + "num_input_tokens_seen": 6644064, + "step": 10195 + }, + { + "epoch": 6.014150943396227, + "grad_norm": 2.7180442810058594, + "learning_rate": 8.822604433601041e-06, + "loss": 0.4001, + "num_input_tokens_seen": 6647488, + "step": 10200 + }, + { + "epoch": 6.017099056603773, + "grad_norm": 2.2489688396453857, + "learning_rate": 8.820945558820158e-06, + "loss": 0.3117, + "num_input_tokens_seen": 6649792, + "step": 10205 + }, + { + "epoch": 6.0200471698113205, + "grad_norm": 2.4209139347076416, + "learning_rate": 8.81928567242747e-06, + "loss": 0.4939, + "num_input_tokens_seen": 6652672, + "step": 10210 + }, + { + "epoch": 6.022995283018868, + "grad_norm": 2.0638842582702637, + "learning_rate": 8.817624774862443e-06, + "loss": 0.5602, + "num_input_tokens_seen": 6656224, + "step": 10215 + }, + { + "epoch": 6.025943396226415, + "grad_norm": 2.3370678424835205, + "learning_rate": 8.815962866564803e-06, + "loss": 0.4647, + "num_input_tokens_seen": 6659840, + "step": 10220 + }, + { + "epoch": 6.028891509433962, + "grad_norm": 1.5156012773513794, + "learning_rate": 8.814299947974547e-06, + "loss": 0.4868, + "num_input_tokens_seen": 6663168, + "step": 10225 + }, + { + "epoch": 6.03183962264151, + "grad_norm": 3.3772552013397217, + "learning_rate": 8.812636019531942e-06, + "loss": 0.4054, + "num_input_tokens_seen": 6665792, + "step": 10230 + }, + { + "epoch": 6.034787735849057, + "grad_norm": 2.0655391216278076, + "learning_rate": 8.810971081677517e-06, + "loss": 0.3985, + "num_input_tokens_seen": 6668928, + "step": 10235 + }, + { + "epoch": 6.037735849056604, + "grad_norm": 1.8166823387145996, + "learning_rate": 8.809305134852076e-06, + "loss": 0.5117, + "num_input_tokens_seen": 6671616, + "step": 10240 + }, + { + "epoch": 6.040683962264151, + "grad_norm": 1.442678451538086, + "learning_rate": 8.807638179496684e-06, + "loss": 0.5803, + "num_input_tokens_seen": 6674912, + "step": 10245 + }, + { + "epoch": 6.043632075471698, + "grad_norm": 1.806138515472412, + "learning_rate": 8.805970216052673e-06, + "loss": 0.586, + "num_input_tokens_seen": 6679072, + "step": 10250 + }, + { + "epoch": 6.046580188679245, + "grad_norm": 2.371598243713379, + "learning_rate": 8.804301244961645e-06, + "loss": 0.4251, + "num_input_tokens_seen": 6681824, + "step": 10255 + }, + { + "epoch": 6.0495283018867925, + "grad_norm": 2.1468987464904785, + "learning_rate": 8.802631266665465e-06, + "loss": 0.3436, + "num_input_tokens_seen": 6685088, + "step": 10260 + }, + { + "epoch": 6.05247641509434, + "grad_norm": 1.8991566896438599, + "learning_rate": 8.80096028160627e-06, + "loss": 0.485, + "num_input_tokens_seen": 6687552, + "step": 10265 + }, + { + "epoch": 6.055424528301887, + "grad_norm": 2.3988687992095947, + "learning_rate": 8.799288290226457e-06, + "loss": 0.3963, + "num_input_tokens_seen": 6690560, + "step": 10270 + }, + { + "epoch": 6.058372641509434, + "grad_norm": 2.086108922958374, + "learning_rate": 8.797615292968698e-06, + "loss": 0.4803, + "num_input_tokens_seen": 6694432, + "step": 10275 + }, + { + "epoch": 6.061320754716981, + "grad_norm": 1.2736926078796387, + "learning_rate": 8.795941290275923e-06, + "loss": 0.3813, + "num_input_tokens_seen": 6697568, + "step": 10280 + }, + { + "epoch": 6.064268867924528, + "grad_norm": 1.2940442562103271, + "learning_rate": 8.79426628259133e-06, + "loss": 0.3476, + "num_input_tokens_seen": 6701184, + "step": 10285 + }, + { + "epoch": 6.067216981132075, + "grad_norm": 3.109651803970337, + "learning_rate": 8.792590270358389e-06, + "loss": 0.4239, + "num_input_tokens_seen": 6704160, + "step": 10290 + }, + { + "epoch": 6.070165094339623, + "grad_norm": 1.497114896774292, + "learning_rate": 8.790913254020827e-06, + "loss": 0.4336, + "num_input_tokens_seen": 6706432, + "step": 10295 + }, + { + "epoch": 6.07311320754717, + "grad_norm": 3.4340968132019043, + "learning_rate": 8.789235234022643e-06, + "loss": 0.4422, + "num_input_tokens_seen": 6709760, + "step": 10300 + }, + { + "epoch": 6.076061320754717, + "grad_norm": 1.7172421216964722, + "learning_rate": 8.787556210808101e-06, + "loss": 0.2895, + "num_input_tokens_seen": 6713248, + "step": 10305 + }, + { + "epoch": 6.0790094339622645, + "grad_norm": 2.9005744457244873, + "learning_rate": 8.78587618482173e-06, + "loss": 0.5111, + "num_input_tokens_seen": 6715904, + "step": 10310 + }, + { + "epoch": 6.081957547169812, + "grad_norm": 0.6793528199195862, + "learning_rate": 8.78419515650832e-06, + "loss": 0.3387, + "num_input_tokens_seen": 6721120, + "step": 10315 + }, + { + "epoch": 6.084905660377358, + "grad_norm": 1.7809995412826538, + "learning_rate": 8.782513126312934e-06, + "loss": 0.4124, + "num_input_tokens_seen": 6724160, + "step": 10320 + }, + { + "epoch": 6.087853773584905, + "grad_norm": 4.138539791107178, + "learning_rate": 8.780830094680897e-06, + "loss": 0.5492, + "num_input_tokens_seen": 6728096, + "step": 10325 + }, + { + "epoch": 6.090801886792453, + "grad_norm": 3.349318027496338, + "learning_rate": 8.779146062057797e-06, + "loss": 0.4293, + "num_input_tokens_seen": 6731392, + "step": 10330 + }, + { + "epoch": 6.09375, + "grad_norm": 2.0829672813415527, + "learning_rate": 8.777461028889492e-06, + "loss": 0.4462, + "num_input_tokens_seen": 6735776, + "step": 10335 + }, + { + "epoch": 6.096698113207547, + "grad_norm": 1.5729119777679443, + "learning_rate": 8.775774995622097e-06, + "loss": 0.3952, + "num_input_tokens_seen": 6738656, + "step": 10340 + }, + { + "epoch": 6.099646226415095, + "grad_norm": 3.1893270015716553, + "learning_rate": 8.774087962702e-06, + "loss": 0.5506, + "num_input_tokens_seen": 6741856, + "step": 10345 + }, + { + "epoch": 6.102594339622642, + "grad_norm": 1.5361682176589966, + "learning_rate": 8.772399930575849e-06, + "loss": 0.4465, + "num_input_tokens_seen": 6746240, + "step": 10350 + }, + { + "epoch": 6.105542452830188, + "grad_norm": 1.7695817947387695, + "learning_rate": 8.77071089969056e-06, + "loss": 0.7837, + "num_input_tokens_seen": 6749088, + "step": 10355 + }, + { + "epoch": 6.1084905660377355, + "grad_norm": 4.132552146911621, + "learning_rate": 8.769020870493309e-06, + "loss": 0.3996, + "num_input_tokens_seen": 6751392, + "step": 10360 + }, + { + "epoch": 6.111438679245283, + "grad_norm": 1.7339767217636108, + "learning_rate": 8.767329843431537e-06, + "loss": 0.4884, + "num_input_tokens_seen": 6755072, + "step": 10365 + }, + { + "epoch": 6.11438679245283, + "grad_norm": 2.132688522338867, + "learning_rate": 8.765637818952954e-06, + "loss": 0.4144, + "num_input_tokens_seen": 6757696, + "step": 10370 + }, + { + "epoch": 6.117334905660377, + "grad_norm": 1.8360496759414673, + "learning_rate": 8.76394479750553e-06, + "loss": 0.344, + "num_input_tokens_seen": 6760736, + "step": 10375 + }, + { + "epoch": 6.120283018867925, + "grad_norm": 1.6938674449920654, + "learning_rate": 8.762250779537499e-06, + "loss": 0.5142, + "num_input_tokens_seen": 6763680, + "step": 10380 + }, + { + "epoch": 6.123231132075472, + "grad_norm": 1.5598729848861694, + "learning_rate": 8.760555765497358e-06, + "loss": 0.3207, + "num_input_tokens_seen": 6766496, + "step": 10385 + }, + { + "epoch": 6.126179245283019, + "grad_norm": 1.6034488677978516, + "learning_rate": 8.758859755833873e-06, + "loss": 0.3245, + "num_input_tokens_seen": 6769280, + "step": 10390 + }, + { + "epoch": 6.129127358490566, + "grad_norm": 2.908996343612671, + "learning_rate": 8.757162750996066e-06, + "loss": 0.4854, + "num_input_tokens_seen": 6772064, + "step": 10395 + }, + { + "epoch": 6.132075471698113, + "grad_norm": 1.359206199645996, + "learning_rate": 8.755464751433229e-06, + "loss": 0.4088, + "num_input_tokens_seen": 6775200, + "step": 10400 + }, + { + "epoch": 6.13502358490566, + "grad_norm": 1.4671485424041748, + "learning_rate": 8.753765757594915e-06, + "loss": 0.4999, + "num_input_tokens_seen": 6778784, + "step": 10405 + }, + { + "epoch": 6.1379716981132075, + "grad_norm": 1.521746039390564, + "learning_rate": 8.752065769930938e-06, + "loss": 0.4165, + "num_input_tokens_seen": 6781536, + "step": 10410 + }, + { + "epoch": 6.140919811320755, + "grad_norm": 1.831125259399414, + "learning_rate": 8.75036478889138e-06, + "loss": 0.3385, + "num_input_tokens_seen": 6784064, + "step": 10415 + }, + { + "epoch": 6.143867924528302, + "grad_norm": 1.5671486854553223, + "learning_rate": 8.748662814926576e-06, + "loss": 0.4212, + "num_input_tokens_seen": 6788736, + "step": 10420 + }, + { + "epoch": 6.146816037735849, + "grad_norm": 1.3478660583496094, + "learning_rate": 8.746959848487139e-06, + "loss": 0.3795, + "num_input_tokens_seen": 6792256, + "step": 10425 + }, + { + "epoch": 6.149764150943396, + "grad_norm": 2.5194122791290283, + "learning_rate": 8.745255890023934e-06, + "loss": 0.3895, + "num_input_tokens_seen": 6795648, + "step": 10430 + }, + { + "epoch": 6.152712264150943, + "grad_norm": 1.3107904195785522, + "learning_rate": 8.74355093998809e-06, + "loss": 0.3765, + "num_input_tokens_seen": 6798784, + "step": 10435 + }, + { + "epoch": 6.15566037735849, + "grad_norm": 1.340149164199829, + "learning_rate": 8.741844998831001e-06, + "loss": 0.3448, + "num_input_tokens_seen": 6802432, + "step": 10440 + }, + { + "epoch": 6.158608490566038, + "grad_norm": 1.7845377922058105, + "learning_rate": 8.740138067004323e-06, + "loss": 0.4029, + "num_input_tokens_seen": 6805312, + "step": 10445 + }, + { + "epoch": 6.161556603773585, + "grad_norm": 3.7810888290405273, + "learning_rate": 8.738430144959973e-06, + "loss": 0.4001, + "num_input_tokens_seen": 6808640, + "step": 10450 + }, + { + "epoch": 6.164504716981132, + "grad_norm": 1.5898064374923706, + "learning_rate": 8.73672123315013e-06, + "loss": 0.4271, + "num_input_tokens_seen": 6811328, + "step": 10455 + }, + { + "epoch": 6.1674528301886795, + "grad_norm": 1.4934734106063843, + "learning_rate": 8.735011332027234e-06, + "loss": 0.411, + "num_input_tokens_seen": 6814400, + "step": 10460 + }, + { + "epoch": 6.170400943396227, + "grad_norm": 3.3223869800567627, + "learning_rate": 8.733300442043993e-06, + "loss": 0.5188, + "num_input_tokens_seen": 6817312, + "step": 10465 + }, + { + "epoch": 6.173349056603773, + "grad_norm": 2.574805974960327, + "learning_rate": 8.73158856365337e-06, + "loss": 0.6431, + "num_input_tokens_seen": 6820192, + "step": 10470 + }, + { + "epoch": 6.1762971698113205, + "grad_norm": 4.194033622741699, + "learning_rate": 8.729875697308592e-06, + "loss": 0.3565, + "num_input_tokens_seen": 6823264, + "step": 10475 + }, + { + "epoch": 6.179245283018868, + "grad_norm": 4.226123332977295, + "learning_rate": 8.728161843463148e-06, + "loss": 0.4331, + "num_input_tokens_seen": 6825888, + "step": 10480 + }, + { + "epoch": 6.182193396226415, + "grad_norm": 3.0703721046447754, + "learning_rate": 8.726447002570791e-06, + "loss": 0.6685, + "num_input_tokens_seen": 6829760, + "step": 10485 + }, + { + "epoch": 6.185141509433962, + "grad_norm": 1.5446996688842773, + "learning_rate": 8.724731175085526e-06, + "loss": 0.4639, + "num_input_tokens_seen": 6832416, + "step": 10490 + }, + { + "epoch": 6.18808962264151, + "grad_norm": 1.5521007776260376, + "learning_rate": 8.723014361461633e-06, + "loss": 0.3098, + "num_input_tokens_seen": 6835776, + "step": 10495 + }, + { + "epoch": 6.191037735849057, + "grad_norm": 0.749037504196167, + "learning_rate": 8.72129656215364e-06, + "loss": 0.406, + "num_input_tokens_seen": 6839328, + "step": 10500 + }, + { + "epoch": 6.193985849056604, + "grad_norm": 1.5499939918518066, + "learning_rate": 8.719577777616347e-06, + "loss": 0.3585, + "num_input_tokens_seen": 6841888, + "step": 10505 + }, + { + "epoch": 6.196933962264151, + "grad_norm": 2.25122332572937, + "learning_rate": 8.717858008304804e-06, + "loss": 0.409, + "num_input_tokens_seen": 6845056, + "step": 10510 + }, + { + "epoch": 6.199882075471698, + "grad_norm": 1.3916805982589722, + "learning_rate": 8.71613725467433e-06, + "loss": 0.4572, + "num_input_tokens_seen": 6847872, + "step": 10515 + }, + { + "epoch": 6.202830188679245, + "grad_norm": 2.3830838203430176, + "learning_rate": 8.714415517180506e-06, + "loss": 0.3304, + "num_input_tokens_seen": 6851776, + "step": 10520 + }, + { + "epoch": 6.2057783018867925, + "grad_norm": 2.5637621879577637, + "learning_rate": 8.712692796279164e-06, + "loss": 0.38, + "num_input_tokens_seen": 6856832, + "step": 10525 + }, + { + "epoch": 6.20872641509434, + "grad_norm": 1.9720937013626099, + "learning_rate": 8.710969092426401e-06, + "loss": 0.3287, + "num_input_tokens_seen": 6859296, + "step": 10530 + }, + { + "epoch": 6.211674528301887, + "grad_norm": 2.9785029888153076, + "learning_rate": 8.70924440607858e-06, + "loss": 0.5329, + "num_input_tokens_seen": 6862304, + "step": 10535 + }, + { + "epoch": 6.214622641509434, + "grad_norm": 2.3837740421295166, + "learning_rate": 8.707518737692315e-06, + "loss": 0.4738, + "num_input_tokens_seen": 6865248, + "step": 10540 + }, + { + "epoch": 6.217570754716981, + "grad_norm": 1.1246404647827148, + "learning_rate": 8.705792087724485e-06, + "loss": 0.2997, + "num_input_tokens_seen": 6868032, + "step": 10545 + }, + { + "epoch": 6.220518867924528, + "grad_norm": 1.95211923122406, + "learning_rate": 8.704064456632231e-06, + "loss": 0.4927, + "num_input_tokens_seen": 6871936, + "step": 10550 + }, + { + "epoch": 6.223466981132075, + "grad_norm": 1.6259949207305908, + "learning_rate": 8.702335844872946e-06, + "loss": 0.3798, + "num_input_tokens_seen": 6874720, + "step": 10555 + }, + { + "epoch": 6.226415094339623, + "grad_norm": 1.6640855073928833, + "learning_rate": 8.700606252904293e-06, + "loss": 0.4181, + "num_input_tokens_seen": 6877984, + "step": 10560 + }, + { + "epoch": 6.22936320754717, + "grad_norm": 2.951138734817505, + "learning_rate": 8.698875681184183e-06, + "loss": 0.4209, + "num_input_tokens_seen": 6881504, + "step": 10565 + }, + { + "epoch": 6.232311320754717, + "grad_norm": 2.4996767044067383, + "learning_rate": 8.697144130170797e-06, + "loss": 0.4598, + "num_input_tokens_seen": 6884832, + "step": 10570 + }, + { + "epoch": 6.2352594339622645, + "grad_norm": 1.5430487394332886, + "learning_rate": 8.695411600322568e-06, + "loss": 0.4888, + "num_input_tokens_seen": 6887648, + "step": 10575 + }, + { + "epoch": 6.238207547169812, + "grad_norm": 1.2646154165267944, + "learning_rate": 8.693678092098191e-06, + "loss": 0.3685, + "num_input_tokens_seen": 6890528, + "step": 10580 + }, + { + "epoch": 6.241155660377358, + "grad_norm": 1.075602412223816, + "learning_rate": 8.691943605956621e-06, + "loss": 0.4171, + "num_input_tokens_seen": 6894112, + "step": 10585 + }, + { + "epoch": 6.244103773584905, + "grad_norm": 1.4442609548568726, + "learning_rate": 8.690208142357069e-06, + "loss": 0.4101, + "num_input_tokens_seen": 6897440, + "step": 10590 + }, + { + "epoch": 6.247051886792453, + "grad_norm": 1.982790470123291, + "learning_rate": 8.68847170175901e-06, + "loss": 0.3522, + "num_input_tokens_seen": 6901088, + "step": 10595 + }, + { + "epoch": 6.25, + "grad_norm": 1.2460302114486694, + "learning_rate": 8.686734284622168e-06, + "loss": 0.3493, + "num_input_tokens_seen": 6903712, + "step": 10600 + }, + { + "epoch": 6.252948113207547, + "grad_norm": 3.115110397338867, + "learning_rate": 8.684995891406537e-06, + "loss": 0.3964, + "num_input_tokens_seen": 6906912, + "step": 10605 + }, + { + "epoch": 6.255896226415095, + "grad_norm": 2.1191303730010986, + "learning_rate": 8.683256522572362e-06, + "loss": 0.5167, + "num_input_tokens_seen": 6909536, + "step": 10610 + }, + { + "epoch": 6.258844339622642, + "grad_norm": 3.4223861694335938, + "learning_rate": 8.68151617858015e-06, + "loss": 0.3299, + "num_input_tokens_seen": 6912320, + "step": 10615 + }, + { + "epoch": 6.261792452830189, + "grad_norm": 1.2667052745819092, + "learning_rate": 8.67977485989066e-06, + "loss": 0.4186, + "num_input_tokens_seen": 6915456, + "step": 10620 + }, + { + "epoch": 6.2647405660377355, + "grad_norm": 3.6430094242095947, + "learning_rate": 8.67803256696492e-06, + "loss": 0.467, + "num_input_tokens_seen": 6919328, + "step": 10625 + }, + { + "epoch": 6.267688679245283, + "grad_norm": 1.7098509073257446, + "learning_rate": 8.676289300264205e-06, + "loss": 0.4627, + "num_input_tokens_seen": 6923104, + "step": 10630 + }, + { + "epoch": 6.27063679245283, + "grad_norm": 2.6843483448028564, + "learning_rate": 8.674545060250054e-06, + "loss": 0.4379, + "num_input_tokens_seen": 6926048, + "step": 10635 + }, + { + "epoch": 6.273584905660377, + "grad_norm": 2.3614320755004883, + "learning_rate": 8.672799847384263e-06, + "loss": 0.3809, + "num_input_tokens_seen": 6929792, + "step": 10640 + }, + { + "epoch": 6.276533018867925, + "grad_norm": 1.6271038055419922, + "learning_rate": 8.671053662128883e-06, + "loss": 0.4686, + "num_input_tokens_seen": 6932736, + "step": 10645 + }, + { + "epoch": 6.279481132075472, + "grad_norm": 1.1814461946487427, + "learning_rate": 8.669306504946223e-06, + "loss": 0.4657, + "num_input_tokens_seen": 6936032, + "step": 10650 + }, + { + "epoch": 6.282429245283019, + "grad_norm": 4.465968608856201, + "learning_rate": 8.667558376298854e-06, + "loss": 0.4659, + "num_input_tokens_seen": 6939136, + "step": 10655 + }, + { + "epoch": 6.285377358490566, + "grad_norm": 2.177250862121582, + "learning_rate": 8.665809276649597e-06, + "loss": 0.3539, + "num_input_tokens_seen": 6942656, + "step": 10660 + }, + { + "epoch": 6.288325471698113, + "grad_norm": 2.0434207916259766, + "learning_rate": 8.664059206461537e-06, + "loss": 0.5863, + "num_input_tokens_seen": 6945376, + "step": 10665 + }, + { + "epoch": 6.29127358490566, + "grad_norm": 2.3921236991882324, + "learning_rate": 8.662308166198009e-06, + "loss": 0.4993, + "num_input_tokens_seen": 6948448, + "step": 10670 + }, + { + "epoch": 6.2942216981132075, + "grad_norm": 5.244947910308838, + "learning_rate": 8.660556156322611e-06, + "loss": 0.4374, + "num_input_tokens_seen": 6951008, + "step": 10675 + }, + { + "epoch": 6.297169811320755, + "grad_norm": 2.308397054672241, + "learning_rate": 8.658803177299196e-06, + "loss": 0.319, + "num_input_tokens_seen": 6954400, + "step": 10680 + }, + { + "epoch": 6.300117924528302, + "grad_norm": 2.06506609916687, + "learning_rate": 8.65704922959187e-06, + "loss": 0.4351, + "num_input_tokens_seen": 6959168, + "step": 10685 + }, + { + "epoch": 6.303066037735849, + "grad_norm": 1.74628746509552, + "learning_rate": 8.655294313664998e-06, + "loss": 0.4923, + "num_input_tokens_seen": 6963456, + "step": 10690 + }, + { + "epoch": 6.306014150943396, + "grad_norm": 1.2609224319458008, + "learning_rate": 8.653538429983204e-06, + "loss": 0.381, + "num_input_tokens_seen": 6967232, + "step": 10695 + }, + { + "epoch": 6.308962264150943, + "grad_norm": 2.5142407417297363, + "learning_rate": 8.651781579011366e-06, + "loss": 0.3356, + "num_input_tokens_seen": 6970336, + "step": 10700 + }, + { + "epoch": 6.31191037735849, + "grad_norm": 1.3669636249542236, + "learning_rate": 8.650023761214615e-06, + "loss": 0.474, + "num_input_tokens_seen": 6973184, + "step": 10705 + }, + { + "epoch": 6.314858490566038, + "grad_norm": 2.93194842338562, + "learning_rate": 8.648264977058344e-06, + "loss": 0.5343, + "num_input_tokens_seen": 6976480, + "step": 10710 + }, + { + "epoch": 6.317806603773585, + "grad_norm": 2.2444570064544678, + "learning_rate": 8.646505227008197e-06, + "loss": 0.5249, + "num_input_tokens_seen": 6980032, + "step": 10715 + }, + { + "epoch": 6.320754716981132, + "grad_norm": 2.386554002761841, + "learning_rate": 8.644744511530074e-06, + "loss": 0.4217, + "num_input_tokens_seen": 6983296, + "step": 10720 + }, + { + "epoch": 6.3237028301886795, + "grad_norm": 1.8733233213424683, + "learning_rate": 8.642982831090135e-06, + "loss": 0.5654, + "num_input_tokens_seen": 6986784, + "step": 10725 + }, + { + "epoch": 6.326650943396227, + "grad_norm": 1.7614340782165527, + "learning_rate": 8.64122018615479e-06, + "loss": 0.5456, + "num_input_tokens_seen": 6991648, + "step": 10730 + }, + { + "epoch": 6.329599056603773, + "grad_norm": 1.538223385810852, + "learning_rate": 8.63945657719071e-06, + "loss": 0.3512, + "num_input_tokens_seen": 6995104, + "step": 10735 + }, + { + "epoch": 6.3325471698113205, + "grad_norm": 1.4793990850448608, + "learning_rate": 8.637692004664816e-06, + "loss": 0.3562, + "num_input_tokens_seen": 6997504, + "step": 10740 + }, + { + "epoch": 6.335495283018868, + "grad_norm": 2.9595794677734375, + "learning_rate": 8.635926469044284e-06, + "loss": 0.5171, + "num_input_tokens_seen": 7001312, + "step": 10745 + }, + { + "epoch": 6.338443396226415, + "grad_norm": 2.223149299621582, + "learning_rate": 8.63415997079655e-06, + "loss": 0.352, + "num_input_tokens_seen": 7004032, + "step": 10750 + }, + { + "epoch": 6.341391509433962, + "grad_norm": 2.6209397315979004, + "learning_rate": 8.6323925103893e-06, + "loss": 0.436, + "num_input_tokens_seen": 7006848, + "step": 10755 + }, + { + "epoch": 6.34433962264151, + "grad_norm": 3.91560697555542, + "learning_rate": 8.63062408829048e-06, + "loss": 0.3455, + "num_input_tokens_seen": 7009600, + "step": 10760 + }, + { + "epoch": 6.347287735849057, + "grad_norm": 2.4054553508758545, + "learning_rate": 8.628854704968285e-06, + "loss": 0.3871, + "num_input_tokens_seen": 7011904, + "step": 10765 + }, + { + "epoch": 6.350235849056604, + "grad_norm": 3.0327184200286865, + "learning_rate": 8.62708436089117e-06, + "loss": 0.3823, + "num_input_tokens_seen": 7015168, + "step": 10770 + }, + { + "epoch": 6.353183962264151, + "grad_norm": 3.2022640705108643, + "learning_rate": 8.625313056527836e-06, + "loss": 0.3832, + "num_input_tokens_seen": 7017632, + "step": 10775 + }, + { + "epoch": 6.356132075471698, + "grad_norm": 1.177520751953125, + "learning_rate": 8.623540792347244e-06, + "loss": 0.3014, + "num_input_tokens_seen": 7022304, + "step": 10780 + }, + { + "epoch": 6.359080188679245, + "grad_norm": 2.0489003658294678, + "learning_rate": 8.621767568818614e-06, + "loss": 0.5369, + "num_input_tokens_seen": 7025280, + "step": 10785 + }, + { + "epoch": 6.3620283018867925, + "grad_norm": 1.0781079530715942, + "learning_rate": 8.619993386411409e-06, + "loss": 0.5328, + "num_input_tokens_seen": 7029216, + "step": 10790 + }, + { + "epoch": 6.36497641509434, + "grad_norm": 1.5288147926330566, + "learning_rate": 8.618218245595356e-06, + "loss": 0.4641, + "num_input_tokens_seen": 7032192, + "step": 10795 + }, + { + "epoch": 6.367924528301887, + "grad_norm": 1.3783318996429443, + "learning_rate": 8.616442146840427e-06, + "loss": 0.477, + "num_input_tokens_seen": 7035072, + "step": 10800 + }, + { + "epoch": 6.370872641509434, + "grad_norm": 2.512392520904541, + "learning_rate": 8.614665090616854e-06, + "loss": 0.4155, + "num_input_tokens_seen": 7039072, + "step": 10805 + }, + { + "epoch": 6.373820754716981, + "grad_norm": 2.566673994064331, + "learning_rate": 8.61288707739512e-06, + "loss": 0.4178, + "num_input_tokens_seen": 7043584, + "step": 10810 + }, + { + "epoch": 6.376768867924528, + "grad_norm": 1.7683953046798706, + "learning_rate": 8.611108107645963e-06, + "loss": 0.4746, + "num_input_tokens_seen": 7047264, + "step": 10815 + }, + { + "epoch": 6.379716981132075, + "grad_norm": 1.623326063156128, + "learning_rate": 8.609328181840368e-06, + "loss": 0.3146, + "num_input_tokens_seen": 7050528, + "step": 10820 + }, + { + "epoch": 6.382665094339623, + "grad_norm": 1.8695857524871826, + "learning_rate": 8.607547300449585e-06, + "loss": 0.4013, + "num_input_tokens_seen": 7053920, + "step": 10825 + }, + { + "epoch": 6.38561320754717, + "grad_norm": 2.623251438140869, + "learning_rate": 8.605765463945105e-06, + "loss": 0.444, + "num_input_tokens_seen": 7056736, + "step": 10830 + }, + { + "epoch": 6.388561320754717, + "grad_norm": 1.643649935722351, + "learning_rate": 8.603982672798678e-06, + "loss": 0.3847, + "num_input_tokens_seen": 7059616, + "step": 10835 + }, + { + "epoch": 6.3915094339622645, + "grad_norm": 1.6045095920562744, + "learning_rate": 8.602198927482309e-06, + "loss": 0.4994, + "num_input_tokens_seen": 7063424, + "step": 10840 + }, + { + "epoch": 6.394457547169811, + "grad_norm": 3.3581929206848145, + "learning_rate": 8.600414228468245e-06, + "loss": 0.3366, + "num_input_tokens_seen": 7067520, + "step": 10845 + }, + { + "epoch": 6.397405660377358, + "grad_norm": 2.165945291519165, + "learning_rate": 8.598628576229e-06, + "loss": 0.4935, + "num_input_tokens_seen": 7070720, + "step": 10850 + }, + { + "epoch": 6.400353773584905, + "grad_norm": 1.7261625528335571, + "learning_rate": 8.596841971237328e-06, + "loss": 0.3289, + "num_input_tokens_seen": 7074656, + "step": 10855 + }, + { + "epoch": 6.403301886792453, + "grad_norm": 1.53758704662323, + "learning_rate": 8.595054413966246e-06, + "loss": 0.4152, + "num_input_tokens_seen": 7077088, + "step": 10860 + }, + { + "epoch": 6.40625, + "grad_norm": 3.110621452331543, + "learning_rate": 8.593265904889011e-06, + "loss": 0.4701, + "num_input_tokens_seen": 7080000, + "step": 10865 + }, + { + "epoch": 6.409198113207547, + "grad_norm": 1.217392921447754, + "learning_rate": 8.591476444479141e-06, + "loss": 0.3213, + "num_input_tokens_seen": 7083072, + "step": 10870 + }, + { + "epoch": 6.412146226415095, + "grad_norm": 1.526443362236023, + "learning_rate": 8.589686033210407e-06, + "loss": 0.4516, + "num_input_tokens_seen": 7085600, + "step": 10875 + }, + { + "epoch": 6.415094339622642, + "grad_norm": 1.4059326648712158, + "learning_rate": 8.587894671556823e-06, + "loss": 0.4884, + "num_input_tokens_seen": 7088960, + "step": 10880 + }, + { + "epoch": 6.418042452830189, + "grad_norm": 3.5297603607177734, + "learning_rate": 8.586102359992663e-06, + "loss": 0.4484, + "num_input_tokens_seen": 7092448, + "step": 10885 + }, + { + "epoch": 6.4209905660377355, + "grad_norm": 2.4462599754333496, + "learning_rate": 8.584309098992447e-06, + "loss": 0.3753, + "num_input_tokens_seen": 7094624, + "step": 10890 + }, + { + "epoch": 6.423938679245283, + "grad_norm": 2.759803533554077, + "learning_rate": 8.58251488903095e-06, + "loss": 0.3845, + "num_input_tokens_seen": 7097120, + "step": 10895 + }, + { + "epoch": 6.42688679245283, + "grad_norm": 1.5519237518310547, + "learning_rate": 8.580719730583196e-06, + "loss": 0.3401, + "num_input_tokens_seen": 7099616, + "step": 10900 + }, + { + "epoch": 6.429834905660377, + "grad_norm": 1.7532367706298828, + "learning_rate": 8.578923624124462e-06, + "loss": 0.388, + "num_input_tokens_seen": 7102592, + "step": 10905 + }, + { + "epoch": 6.432783018867925, + "grad_norm": 3.7045364379882812, + "learning_rate": 8.577126570130273e-06, + "loss": 0.4829, + "num_input_tokens_seen": 7105024, + "step": 10910 + }, + { + "epoch": 6.435731132075472, + "grad_norm": 1.9576891660690308, + "learning_rate": 8.575328569076408e-06, + "loss": 0.3273, + "num_input_tokens_seen": 7109056, + "step": 10915 + }, + { + "epoch": 6.438679245283019, + "grad_norm": 3.592815399169922, + "learning_rate": 8.573529621438896e-06, + "loss": 0.4052, + "num_input_tokens_seen": 7114080, + "step": 10920 + }, + { + "epoch": 6.441627358490566, + "grad_norm": 2.876549005508423, + "learning_rate": 8.571729727694015e-06, + "loss": 0.437, + "num_input_tokens_seen": 7118016, + "step": 10925 + }, + { + "epoch": 6.444575471698113, + "grad_norm": 2.1755335330963135, + "learning_rate": 8.569928888318298e-06, + "loss": 0.4822, + "num_input_tokens_seen": 7120288, + "step": 10930 + }, + { + "epoch": 6.44752358490566, + "grad_norm": 3.2360498905181885, + "learning_rate": 8.56812710378852e-06, + "loss": 0.5622, + "num_input_tokens_seen": 7123200, + "step": 10935 + }, + { + "epoch": 6.4504716981132075, + "grad_norm": 3.8863954544067383, + "learning_rate": 8.566324374581714e-06, + "loss": 0.3988, + "num_input_tokens_seen": 7126336, + "step": 10940 + }, + { + "epoch": 6.453419811320755, + "grad_norm": 1.6692085266113281, + "learning_rate": 8.564520701175158e-06, + "loss": 0.4559, + "num_input_tokens_seen": 7129312, + "step": 10945 + }, + { + "epoch": 6.456367924528302, + "grad_norm": 3.6473913192749023, + "learning_rate": 8.562716084046387e-06, + "loss": 0.5083, + "num_input_tokens_seen": 7133088, + "step": 10950 + }, + { + "epoch": 6.459316037735849, + "grad_norm": 3.0011744499206543, + "learning_rate": 8.560910523673177e-06, + "loss": 0.4521, + "num_input_tokens_seen": 7137248, + "step": 10955 + }, + { + "epoch": 6.462264150943396, + "grad_norm": 1.306213140487671, + "learning_rate": 8.55910402053356e-06, + "loss": 0.2731, + "num_input_tokens_seen": 7141312, + "step": 10960 + }, + { + "epoch": 6.465212264150943, + "grad_norm": 1.998231291770935, + "learning_rate": 8.557296575105814e-06, + "loss": 0.5163, + "num_input_tokens_seen": 7145312, + "step": 10965 + }, + { + "epoch": 6.46816037735849, + "grad_norm": 2.068466901779175, + "learning_rate": 8.555488187868469e-06, + "loss": 0.3538, + "num_input_tokens_seen": 7147840, + "step": 10970 + }, + { + "epoch": 6.471108490566038, + "grad_norm": 2.4957549571990967, + "learning_rate": 8.5536788593003e-06, + "loss": 0.3487, + "num_input_tokens_seen": 7152256, + "step": 10975 + }, + { + "epoch": 6.474056603773585, + "grad_norm": 1.5608731508255005, + "learning_rate": 8.55186858988034e-06, + "loss": 0.4033, + "num_input_tokens_seen": 7154976, + "step": 10980 + }, + { + "epoch": 6.477004716981132, + "grad_norm": 1.6199347972869873, + "learning_rate": 8.550057380087863e-06, + "loss": 0.525, + "num_input_tokens_seen": 7158112, + "step": 10985 + }, + { + "epoch": 6.4799528301886795, + "grad_norm": 4.086792945861816, + "learning_rate": 8.548245230402396e-06, + "loss": 0.4632, + "num_input_tokens_seen": 7162240, + "step": 10990 + }, + { + "epoch": 6.482900943396227, + "grad_norm": 2.2521724700927734, + "learning_rate": 8.546432141303711e-06, + "loss": 0.4392, + "num_input_tokens_seen": 7166880, + "step": 10995 + }, + { + "epoch": 6.485849056603773, + "grad_norm": 1.2129896879196167, + "learning_rate": 8.544618113271833e-06, + "loss": 0.3647, + "num_input_tokens_seen": 7170496, + "step": 11000 + }, + { + "epoch": 6.4887971698113205, + "grad_norm": 1.4614900350570679, + "learning_rate": 8.542803146787032e-06, + "loss": 0.5893, + "num_input_tokens_seen": 7174144, + "step": 11005 + }, + { + "epoch": 6.491745283018868, + "grad_norm": 1.163750171661377, + "learning_rate": 8.54098724232983e-06, + "loss": 0.4448, + "num_input_tokens_seen": 7176992, + "step": 11010 + }, + { + "epoch": 6.494693396226415, + "grad_norm": 2.3187289237976074, + "learning_rate": 8.539170400380994e-06, + "loss": 0.531, + "num_input_tokens_seen": 7179328, + "step": 11015 + }, + { + "epoch": 6.497641509433962, + "grad_norm": 1.4366041421890259, + "learning_rate": 8.537352621421542e-06, + "loss": 0.4361, + "num_input_tokens_seen": 7181952, + "step": 11020 + }, + { + "epoch": 6.50058962264151, + "grad_norm": 1.4318405389785767, + "learning_rate": 8.535533905932739e-06, + "loss": 0.4972, + "num_input_tokens_seen": 7185536, + "step": 11025 + }, + { + "epoch": 6.503537735849057, + "grad_norm": 2.680626153945923, + "learning_rate": 8.533714254396096e-06, + "loss": 0.4412, + "num_input_tokens_seen": 7188288, + "step": 11030 + }, + { + "epoch": 6.506485849056604, + "grad_norm": 2.201526641845703, + "learning_rate": 8.531893667293375e-06, + "loss": 0.4875, + "num_input_tokens_seen": 7192128, + "step": 11035 + }, + { + "epoch": 6.509433962264151, + "grad_norm": 1.1782864332199097, + "learning_rate": 8.530072145106585e-06, + "loss": 0.3171, + "num_input_tokens_seen": 7195104, + "step": 11040 + }, + { + "epoch": 6.512382075471698, + "grad_norm": 2.1387217044830322, + "learning_rate": 8.528249688317978e-06, + "loss": 0.3848, + "num_input_tokens_seen": 7197952, + "step": 11045 + }, + { + "epoch": 6.515330188679245, + "grad_norm": 3.1230356693267822, + "learning_rate": 8.526426297410062e-06, + "loss": 0.3133, + "num_input_tokens_seen": 7201216, + "step": 11050 + }, + { + "epoch": 6.5182783018867925, + "grad_norm": 1.4122166633605957, + "learning_rate": 8.524601972865586e-06, + "loss": 0.402, + "num_input_tokens_seen": 7207520, + "step": 11055 + }, + { + "epoch": 6.52122641509434, + "grad_norm": 1.490643858909607, + "learning_rate": 8.522776715167548e-06, + "loss": 0.4625, + "num_input_tokens_seen": 7211712, + "step": 11060 + }, + { + "epoch": 6.524174528301887, + "grad_norm": 2.2613625526428223, + "learning_rate": 8.520950524799192e-06, + "loss": 0.2418, + "num_input_tokens_seen": 7214112, + "step": 11065 + }, + { + "epoch": 6.527122641509434, + "grad_norm": 2.9873292446136475, + "learning_rate": 8.51912340224401e-06, + "loss": 0.5313, + "num_input_tokens_seen": 7219616, + "step": 11070 + }, + { + "epoch": 6.530070754716981, + "grad_norm": 3.4120967388153076, + "learning_rate": 8.51729534798574e-06, + "loss": 0.3658, + "num_input_tokens_seen": 7222240, + "step": 11075 + }, + { + "epoch": 6.533018867924528, + "grad_norm": 2.680600881576538, + "learning_rate": 8.515466362508369e-06, + "loss": 0.4608, + "num_input_tokens_seen": 7224832, + "step": 11080 + }, + { + "epoch": 6.535966981132075, + "grad_norm": 1.951891303062439, + "learning_rate": 8.513636446296125e-06, + "loss": 0.3542, + "num_input_tokens_seen": 7227648, + "step": 11085 + }, + { + "epoch": 6.538915094339623, + "grad_norm": 1.6225849390029907, + "learning_rate": 8.51180559983349e-06, + "loss": 0.4697, + "num_input_tokens_seen": 7231328, + "step": 11090 + }, + { + "epoch": 6.54186320754717, + "grad_norm": 2.1897928714752197, + "learning_rate": 8.50997382360519e-06, + "loss": 0.4618, + "num_input_tokens_seen": 7234592, + "step": 11095 + }, + { + "epoch": 6.544811320754717, + "grad_norm": 2.547898054122925, + "learning_rate": 8.508141118096191e-06, + "loss": 0.5463, + "num_input_tokens_seen": 7237600, + "step": 11100 + }, + { + "epoch": 6.5477594339622645, + "grad_norm": 1.795732021331787, + "learning_rate": 8.506307483791712e-06, + "loss": 0.3954, + "num_input_tokens_seen": 7240416, + "step": 11105 + }, + { + "epoch": 6.550707547169811, + "grad_norm": 1.790718913078308, + "learning_rate": 8.504472921177215e-06, + "loss": 0.3508, + "num_input_tokens_seen": 7243104, + "step": 11110 + }, + { + "epoch": 6.553655660377358, + "grad_norm": 3.0685505867004395, + "learning_rate": 8.502637430738409e-06, + "loss": 0.5178, + "num_input_tokens_seen": 7246336, + "step": 11115 + }, + { + "epoch": 6.556603773584905, + "grad_norm": 2.019568681716919, + "learning_rate": 8.500801012961248e-06, + "loss": 0.4293, + "num_input_tokens_seen": 7249152, + "step": 11120 + }, + { + "epoch": 6.559551886792453, + "grad_norm": 1.8914834260940552, + "learning_rate": 8.49896366833193e-06, + "loss": 0.4648, + "num_input_tokens_seen": 7252192, + "step": 11125 + }, + { + "epoch": 6.5625, + "grad_norm": 1.3950716257095337, + "learning_rate": 8.497125397336903e-06, + "loss": 0.3835, + "num_input_tokens_seen": 7255232, + "step": 11130 + }, + { + "epoch": 6.565448113207547, + "grad_norm": 4.4465250968933105, + "learning_rate": 8.495286200462854e-06, + "loss": 0.4251, + "num_input_tokens_seen": 7258016, + "step": 11135 + }, + { + "epoch": 6.568396226415095, + "grad_norm": 1.8653334379196167, + "learning_rate": 8.49344607819672e-06, + "loss": 0.3453, + "num_input_tokens_seen": 7260384, + "step": 11140 + }, + { + "epoch": 6.571344339622642, + "grad_norm": 2.5832009315490723, + "learning_rate": 8.49160503102568e-06, + "loss": 0.5215, + "num_input_tokens_seen": 7263264, + "step": 11145 + }, + { + "epoch": 6.574292452830189, + "grad_norm": 2.406696319580078, + "learning_rate": 8.489763059437161e-06, + "loss": 0.4045, + "num_input_tokens_seen": 7266464, + "step": 11150 + }, + { + "epoch": 6.5772405660377355, + "grad_norm": 2.7078311443328857, + "learning_rate": 8.487920163918833e-06, + "loss": 0.3774, + "num_input_tokens_seen": 7269472, + "step": 11155 + }, + { + "epoch": 6.580188679245283, + "grad_norm": 1.8479723930358887, + "learning_rate": 8.486076344958607e-06, + "loss": 0.4179, + "num_input_tokens_seen": 7273632, + "step": 11160 + }, + { + "epoch": 6.58313679245283, + "grad_norm": 4.485570907592773, + "learning_rate": 8.484231603044647e-06, + "loss": 0.4313, + "num_input_tokens_seen": 7277344, + "step": 11165 + }, + { + "epoch": 6.586084905660377, + "grad_norm": 2.016378402709961, + "learning_rate": 8.482385938665352e-06, + "loss": 0.283, + "num_input_tokens_seen": 7280128, + "step": 11170 + }, + { + "epoch": 6.589033018867925, + "grad_norm": 1.260894536972046, + "learning_rate": 8.480539352309373e-06, + "loss": 0.4056, + "num_input_tokens_seen": 7283648, + "step": 11175 + }, + { + "epoch": 6.591981132075472, + "grad_norm": 3.133361339569092, + "learning_rate": 8.478691844465598e-06, + "loss": 0.4323, + "num_input_tokens_seen": 7286272, + "step": 11180 + }, + { + "epoch": 6.594929245283019, + "grad_norm": 4.142938137054443, + "learning_rate": 8.476843415623168e-06, + "loss": 0.3752, + "num_input_tokens_seen": 7289568, + "step": 11185 + }, + { + "epoch": 6.597877358490566, + "grad_norm": 2.6723999977111816, + "learning_rate": 8.474994066271458e-06, + "loss": 0.5469, + "num_input_tokens_seen": 7293696, + "step": 11190 + }, + { + "epoch": 6.600825471698113, + "grad_norm": 3.1884076595306396, + "learning_rate": 8.473143796900089e-06, + "loss": 0.5029, + "num_input_tokens_seen": 7296160, + "step": 11195 + }, + { + "epoch": 6.60377358490566, + "grad_norm": 3.000969171524048, + "learning_rate": 8.471292607998936e-06, + "loss": 0.5805, + "num_input_tokens_seen": 7299936, + "step": 11200 + }, + { + "epoch": 6.6067216981132075, + "grad_norm": 1.8377585411071777, + "learning_rate": 8.469440500058104e-06, + "loss": 0.4418, + "num_input_tokens_seen": 7305664, + "step": 11205 + }, + { + "epoch": 6.609669811320755, + "grad_norm": 2.339547872543335, + "learning_rate": 8.467587473567945e-06, + "loss": 0.6698, + "num_input_tokens_seen": 7309472, + "step": 11210 + }, + { + "epoch": 6.612617924528302, + "grad_norm": 1.1318994760513306, + "learning_rate": 8.46573352901906e-06, + "loss": 0.342, + "num_input_tokens_seen": 7313152, + "step": 11215 + }, + { + "epoch": 6.615566037735849, + "grad_norm": 4.024058818817139, + "learning_rate": 8.463878666902286e-06, + "loss": 0.4557, + "num_input_tokens_seen": 7318112, + "step": 11220 + }, + { + "epoch": 6.618514150943396, + "grad_norm": 2.114833116531372, + "learning_rate": 8.462022887708706e-06, + "loss": 0.3286, + "num_input_tokens_seen": 7320800, + "step": 11225 + }, + { + "epoch": 6.621462264150943, + "grad_norm": 2.0244078636169434, + "learning_rate": 8.460166191929646e-06, + "loss": 0.4098, + "num_input_tokens_seen": 7324736, + "step": 11230 + }, + { + "epoch": 6.62441037735849, + "grad_norm": 1.5146896839141846, + "learning_rate": 8.458308580056675e-06, + "loss": 0.4455, + "num_input_tokens_seen": 7328352, + "step": 11235 + }, + { + "epoch": 6.627358490566038, + "grad_norm": 1.4580230712890625, + "learning_rate": 8.456450052581602e-06, + "loss": 0.4135, + "num_input_tokens_seen": 7332064, + "step": 11240 + }, + { + "epoch": 6.630306603773585, + "grad_norm": 1.9581786394119263, + "learning_rate": 8.45459060999648e-06, + "loss": 0.5028, + "num_input_tokens_seen": 7336224, + "step": 11245 + }, + { + "epoch": 6.633254716981132, + "grad_norm": 2.674583673477173, + "learning_rate": 8.452730252793608e-06, + "loss": 0.5647, + "num_input_tokens_seen": 7339424, + "step": 11250 + }, + { + "epoch": 6.6362028301886795, + "grad_norm": 1.0306998491287231, + "learning_rate": 8.450868981465519e-06, + "loss": 0.3141, + "num_input_tokens_seen": 7342912, + "step": 11255 + }, + { + "epoch": 6.639150943396227, + "grad_norm": 1.9173943996429443, + "learning_rate": 8.449006796504997e-06, + "loss": 0.3394, + "num_input_tokens_seen": 7345216, + "step": 11260 + }, + { + "epoch": 6.642099056603773, + "grad_norm": 1.431261658668518, + "learning_rate": 8.44714369840506e-06, + "loss": 0.4621, + "num_input_tokens_seen": 7348736, + "step": 11265 + }, + { + "epoch": 6.6450471698113205, + "grad_norm": 1.7577106952667236, + "learning_rate": 8.445279687658973e-06, + "loss": 0.3726, + "num_input_tokens_seen": 7351616, + "step": 11270 + }, + { + "epoch": 6.647995283018868, + "grad_norm": 1.814050555229187, + "learning_rate": 8.44341476476024e-06, + "loss": 0.3986, + "num_input_tokens_seen": 7359168, + "step": 11275 + }, + { + "epoch": 6.650943396226415, + "grad_norm": 1.8595256805419922, + "learning_rate": 8.441548930202608e-06, + "loss": 0.403, + "num_input_tokens_seen": 7361984, + "step": 11280 + }, + { + "epoch": 6.653891509433962, + "grad_norm": 1.3324761390686035, + "learning_rate": 8.439682184480065e-06, + "loss": 0.4786, + "num_input_tokens_seen": 7365216, + "step": 11285 + }, + { + "epoch": 6.65683962264151, + "grad_norm": 4.095402240753174, + "learning_rate": 8.437814528086837e-06, + "loss": 0.5055, + "num_input_tokens_seen": 7368512, + "step": 11290 + }, + { + "epoch": 6.659787735849057, + "grad_norm": 1.4543209075927734, + "learning_rate": 8.435945961517398e-06, + "loss": 0.4474, + "num_input_tokens_seen": 7371040, + "step": 11295 + }, + { + "epoch": 6.662735849056604, + "grad_norm": 2.130070686340332, + "learning_rate": 8.434076485266458e-06, + "loss": 0.4074, + "num_input_tokens_seen": 7374496, + "step": 11300 + }, + { + "epoch": 6.665683962264151, + "grad_norm": 2.02514386177063, + "learning_rate": 8.432206099828969e-06, + "loss": 0.5354, + "num_input_tokens_seen": 7378112, + "step": 11305 + }, + { + "epoch": 6.668632075471698, + "grad_norm": 1.762580394744873, + "learning_rate": 8.430334805700122e-06, + "loss": 0.3799, + "num_input_tokens_seen": 7381792, + "step": 11310 + }, + { + "epoch": 6.671580188679245, + "grad_norm": 1.644616961479187, + "learning_rate": 8.428462603375351e-06, + "loss": 0.5497, + "num_input_tokens_seen": 7384384, + "step": 11315 + }, + { + "epoch": 6.6745283018867925, + "grad_norm": 1.644087314605713, + "learning_rate": 8.426589493350332e-06, + "loss": 0.5424, + "num_input_tokens_seen": 7388032, + "step": 11320 + }, + { + "epoch": 6.67747641509434, + "grad_norm": 3.4967098236083984, + "learning_rate": 8.424715476120976e-06, + "loss": 0.6295, + "num_input_tokens_seen": 7391136, + "step": 11325 + }, + { + "epoch": 6.680424528301887, + "grad_norm": 3.3440873622894287, + "learning_rate": 8.422840552183437e-06, + "loss": 0.5037, + "num_input_tokens_seen": 7394560, + "step": 11330 + }, + { + "epoch": 6.683372641509434, + "grad_norm": 1.3333353996276855, + "learning_rate": 8.420964722034111e-06, + "loss": 0.3815, + "num_input_tokens_seen": 7398208, + "step": 11335 + }, + { + "epoch": 6.686320754716981, + "grad_norm": 1.8587840795516968, + "learning_rate": 8.41908798616963e-06, + "loss": 0.4193, + "num_input_tokens_seen": 7401600, + "step": 11340 + }, + { + "epoch": 6.689268867924528, + "grad_norm": 2.1222939491271973, + "learning_rate": 8.41721034508687e-06, + "loss": 0.3391, + "num_input_tokens_seen": 7405504, + "step": 11345 + }, + { + "epoch": 6.692216981132075, + "grad_norm": 2.993468761444092, + "learning_rate": 8.415331799282942e-06, + "loss": 0.3748, + "num_input_tokens_seen": 7408192, + "step": 11350 + }, + { + "epoch": 6.695165094339623, + "grad_norm": 1.5288426876068115, + "learning_rate": 8.413452349255205e-06, + "loss": 0.4104, + "num_input_tokens_seen": 7410752, + "step": 11355 + }, + { + "epoch": 6.69811320754717, + "grad_norm": 2.077700138092041, + "learning_rate": 8.411571995501245e-06, + "loss": 0.4477, + "num_input_tokens_seen": 7413376, + "step": 11360 + }, + { + "epoch": 6.701061320754717, + "grad_norm": 1.739641785621643, + "learning_rate": 8.409690738518895e-06, + "loss": 0.5333, + "num_input_tokens_seen": 7417344, + "step": 11365 + }, + { + "epoch": 6.7040094339622645, + "grad_norm": 2.0545406341552734, + "learning_rate": 8.407808578806229e-06, + "loss": 0.3967, + "num_input_tokens_seen": 7420608, + "step": 11370 + }, + { + "epoch": 6.706957547169811, + "grad_norm": 1.7327436208724976, + "learning_rate": 8.405925516861555e-06, + "loss": 0.3422, + "num_input_tokens_seen": 7423840, + "step": 11375 + }, + { + "epoch": 6.709905660377358, + "grad_norm": 1.6333808898925781, + "learning_rate": 8.40404155318342e-06, + "loss": 0.4422, + "num_input_tokens_seen": 7427840, + "step": 11380 + }, + { + "epoch": 6.712853773584905, + "grad_norm": 2.378512144088745, + "learning_rate": 8.402156688270613e-06, + "loss": 0.5032, + "num_input_tokens_seen": 7431328, + "step": 11385 + }, + { + "epoch": 6.715801886792453, + "grad_norm": 1.7582159042358398, + "learning_rate": 8.400270922622162e-06, + "loss": 0.4468, + "num_input_tokens_seen": 7435296, + "step": 11390 + }, + { + "epoch": 6.71875, + "grad_norm": 2.4809610843658447, + "learning_rate": 8.398384256737328e-06, + "loss": 0.4916, + "num_input_tokens_seen": 7438048, + "step": 11395 + }, + { + "epoch": 6.721698113207547, + "grad_norm": 2.156104326248169, + "learning_rate": 8.396496691115619e-06, + "loss": 0.6071, + "num_input_tokens_seen": 7441088, + "step": 11400 + }, + { + "epoch": 6.724646226415095, + "grad_norm": 3.0070583820343018, + "learning_rate": 8.39460822625677e-06, + "loss": 0.4441, + "num_input_tokens_seen": 7443360, + "step": 11405 + }, + { + "epoch": 6.727594339622642, + "grad_norm": 0.8194558620452881, + "learning_rate": 8.392718862660765e-06, + "loss": 0.3033, + "num_input_tokens_seen": 7447616, + "step": 11410 + }, + { + "epoch": 6.730542452830189, + "grad_norm": 2.067647933959961, + "learning_rate": 8.390828600827818e-06, + "loss": 0.4327, + "num_input_tokens_seen": 7450240, + "step": 11415 + }, + { + "epoch": 6.7334905660377355, + "grad_norm": 1.7567627429962158, + "learning_rate": 8.388937441258385e-06, + "loss": 0.516, + "num_input_tokens_seen": 7453664, + "step": 11420 + }, + { + "epoch": 6.736438679245283, + "grad_norm": 2.1843059062957764, + "learning_rate": 8.387045384453162e-06, + "loss": 0.4303, + "num_input_tokens_seen": 7456800, + "step": 11425 + }, + { + "epoch": 6.73938679245283, + "grad_norm": 2.202421188354492, + "learning_rate": 8.385152430913073e-06, + "loss": 0.4387, + "num_input_tokens_seen": 7459648, + "step": 11430 + }, + { + "epoch": 6.742334905660377, + "grad_norm": 4.489969730377197, + "learning_rate": 8.383258581139288e-06, + "loss": 0.4903, + "num_input_tokens_seen": 7463424, + "step": 11435 + }, + { + "epoch": 6.745283018867925, + "grad_norm": 1.502013921737671, + "learning_rate": 8.381363835633213e-06, + "loss": 0.451, + "num_input_tokens_seen": 7466144, + "step": 11440 + }, + { + "epoch": 6.748231132075472, + "grad_norm": 1.7006384134292603, + "learning_rate": 8.379468194896492e-06, + "loss": 0.4346, + "num_input_tokens_seen": 7468992, + "step": 11445 + }, + { + "epoch": 6.751179245283019, + "grad_norm": 6.214755058288574, + "learning_rate": 8.377571659431e-06, + "loss": 0.5312, + "num_input_tokens_seen": 7472288, + "step": 11450 + }, + { + "epoch": 6.754127358490566, + "grad_norm": 3.390658140182495, + "learning_rate": 8.375674229738855e-06, + "loss": 0.4775, + "num_input_tokens_seen": 7475648, + "step": 11455 + }, + { + "epoch": 6.757075471698113, + "grad_norm": 2.702378988265991, + "learning_rate": 8.37377590632241e-06, + "loss": 0.5368, + "num_input_tokens_seen": 7479712, + "step": 11460 + }, + { + "epoch": 6.76002358490566, + "grad_norm": 3.6077725887298584, + "learning_rate": 8.371876689684253e-06, + "loss": 0.454, + "num_input_tokens_seen": 7482688, + "step": 11465 + }, + { + "epoch": 6.7629716981132075, + "grad_norm": 1.4233475923538208, + "learning_rate": 8.369976580327211e-06, + "loss": 0.3797, + "num_input_tokens_seen": 7486176, + "step": 11470 + }, + { + "epoch": 6.765919811320755, + "grad_norm": 2.197141408920288, + "learning_rate": 8.368075578754345e-06, + "loss": 0.4165, + "num_input_tokens_seen": 7488384, + "step": 11475 + }, + { + "epoch": 6.768867924528302, + "grad_norm": 1.0275706052780151, + "learning_rate": 8.366173685468952e-06, + "loss": 0.4835, + "num_input_tokens_seen": 7491680, + "step": 11480 + }, + { + "epoch": 6.771816037735849, + "grad_norm": 1.7352893352508545, + "learning_rate": 8.364270900974572e-06, + "loss": 0.3695, + "num_input_tokens_seen": 7494592, + "step": 11485 + }, + { + "epoch": 6.774764150943396, + "grad_norm": 3.045090913772583, + "learning_rate": 8.362367225774968e-06, + "loss": 0.4804, + "num_input_tokens_seen": 7497632, + "step": 11490 + }, + { + "epoch": 6.777712264150943, + "grad_norm": 1.319770097732544, + "learning_rate": 8.360462660374153e-06, + "loss": 0.4748, + "num_input_tokens_seen": 7500096, + "step": 11495 + }, + { + "epoch": 6.78066037735849, + "grad_norm": 1.8948845863342285, + "learning_rate": 8.358557205276365e-06, + "loss": 0.4511, + "num_input_tokens_seen": 7503392, + "step": 11500 + }, + { + "epoch": 6.783608490566038, + "grad_norm": 1.3515570163726807, + "learning_rate": 8.356650860986083e-06, + "loss": 0.3506, + "num_input_tokens_seen": 7506304, + "step": 11505 + }, + { + "epoch": 6.786556603773585, + "grad_norm": 1.884287714958191, + "learning_rate": 8.354743628008017e-06, + "loss": 0.4215, + "num_input_tokens_seen": 7509376, + "step": 11510 + }, + { + "epoch": 6.789504716981132, + "grad_norm": 2.448120594024658, + "learning_rate": 8.35283550684712e-06, + "loss": 0.547, + "num_input_tokens_seen": 7512480, + "step": 11515 + }, + { + "epoch": 6.7924528301886795, + "grad_norm": 3.4300365447998047, + "learning_rate": 8.350926498008572e-06, + "loss": 0.586, + "num_input_tokens_seen": 7515552, + "step": 11520 + }, + { + "epoch": 6.795400943396227, + "grad_norm": 2.6889214515686035, + "learning_rate": 8.34901660199779e-06, + "loss": 0.3822, + "num_input_tokens_seen": 7518656, + "step": 11525 + }, + { + "epoch": 6.798349056603773, + "grad_norm": 1.9622069597244263, + "learning_rate": 8.347105819320432e-06, + "loss": 0.4835, + "num_input_tokens_seen": 7521952, + "step": 11530 + }, + { + "epoch": 6.8012971698113205, + "grad_norm": 2.5863547325134277, + "learning_rate": 8.34519415048238e-06, + "loss": 0.395, + "num_input_tokens_seen": 7525376, + "step": 11535 + }, + { + "epoch": 6.804245283018868, + "grad_norm": 2.9789795875549316, + "learning_rate": 8.343281595989761e-06, + "loss": 0.3832, + "num_input_tokens_seen": 7528128, + "step": 11540 + }, + { + "epoch": 6.807193396226415, + "grad_norm": 1.7870593070983887, + "learning_rate": 8.341368156348933e-06, + "loss": 0.4204, + "num_input_tokens_seen": 7531328, + "step": 11545 + }, + { + "epoch": 6.810141509433962, + "grad_norm": 8.119572639465332, + "learning_rate": 8.339453832066482e-06, + "loss": 0.4761, + "num_input_tokens_seen": 7533792, + "step": 11550 + }, + { + "epoch": 6.81308962264151, + "grad_norm": 2.8290703296661377, + "learning_rate": 8.337538623649237e-06, + "loss": 0.3677, + "num_input_tokens_seen": 7537216, + "step": 11555 + }, + { + "epoch": 6.816037735849057, + "grad_norm": 1.4916973114013672, + "learning_rate": 8.33562253160426e-06, + "loss": 0.5086, + "num_input_tokens_seen": 7539712, + "step": 11560 + }, + { + "epoch": 6.818985849056604, + "grad_norm": 3.5682928562164307, + "learning_rate": 8.33370555643884e-06, + "loss": 0.5623, + "num_input_tokens_seen": 7542848, + "step": 11565 + }, + { + "epoch": 6.821933962264151, + "grad_norm": 2.3088340759277344, + "learning_rate": 8.331787698660507e-06, + "loss": 0.3821, + "num_input_tokens_seen": 7545888, + "step": 11570 + }, + { + "epoch": 6.824882075471698, + "grad_norm": 1.5998623371124268, + "learning_rate": 8.32986895877702e-06, + "loss": 0.5281, + "num_input_tokens_seen": 7548864, + "step": 11575 + }, + { + "epoch": 6.827830188679245, + "grad_norm": 2.7926735877990723, + "learning_rate": 8.327949337296378e-06, + "loss": 0.3915, + "num_input_tokens_seen": 7551232, + "step": 11580 + }, + { + "epoch": 6.8307783018867925, + "grad_norm": 2.8393094539642334, + "learning_rate": 8.326028834726803e-06, + "loss": 0.4711, + "num_input_tokens_seen": 7555040, + "step": 11585 + }, + { + "epoch": 6.83372641509434, + "grad_norm": 2.076556921005249, + "learning_rate": 8.324107451576762e-06, + "loss": 0.4952, + "num_input_tokens_seen": 7557664, + "step": 11590 + }, + { + "epoch": 6.836674528301887, + "grad_norm": 1.7905797958374023, + "learning_rate": 8.322185188354947e-06, + "loss": 0.4391, + "num_input_tokens_seen": 7561856, + "step": 11595 + }, + { + "epoch": 6.839622641509434, + "grad_norm": 1.6176079511642456, + "learning_rate": 8.320262045570284e-06, + "loss": 0.3429, + "num_input_tokens_seen": 7565248, + "step": 11600 + }, + { + "epoch": 6.842570754716981, + "grad_norm": 2.686922311782837, + "learning_rate": 8.318338023731937e-06, + "loss": 0.3578, + "num_input_tokens_seen": 7569536, + "step": 11605 + }, + { + "epoch": 6.845518867924528, + "grad_norm": 2.6717967987060547, + "learning_rate": 8.316413123349296e-06, + "loss": 0.415, + "num_input_tokens_seen": 7572288, + "step": 11610 + }, + { + "epoch": 6.848466981132075, + "grad_norm": 1.9239338636398315, + "learning_rate": 8.314487344931987e-06, + "loss": 0.4335, + "num_input_tokens_seen": 7575520, + "step": 11615 + }, + { + "epoch": 6.851415094339623, + "grad_norm": 2.557251453399658, + "learning_rate": 8.31256068898987e-06, + "loss": 0.4561, + "num_input_tokens_seen": 7579264, + "step": 11620 + }, + { + "epoch": 6.85436320754717, + "grad_norm": 2.58467435836792, + "learning_rate": 8.310633156033032e-06, + "loss": 0.3629, + "num_input_tokens_seen": 7582496, + "step": 11625 + }, + { + "epoch": 6.857311320754717, + "grad_norm": 1.9937858581542969, + "learning_rate": 8.3087047465718e-06, + "loss": 0.4917, + "num_input_tokens_seen": 7586144, + "step": 11630 + }, + { + "epoch": 6.8602594339622645, + "grad_norm": 1.9986299276351929, + "learning_rate": 8.306775461116727e-06, + "loss": 0.4278, + "num_input_tokens_seen": 7589184, + "step": 11635 + }, + { + "epoch": 6.863207547169811, + "grad_norm": 1.6567435264587402, + "learning_rate": 8.304845300178597e-06, + "loss": 0.3716, + "num_input_tokens_seen": 7592480, + "step": 11640 + }, + { + "epoch": 6.866155660377358, + "grad_norm": 1.3341597318649292, + "learning_rate": 8.302914264268433e-06, + "loss": 0.4902, + "num_input_tokens_seen": 7595584, + "step": 11645 + }, + { + "epoch": 6.869103773584905, + "grad_norm": 1.602872371673584, + "learning_rate": 8.300982353897482e-06, + "loss": 0.3334, + "num_input_tokens_seen": 7599008, + "step": 11650 + }, + { + "epoch": 6.872051886792453, + "grad_norm": 2.1271252632141113, + "learning_rate": 8.299049569577226e-06, + "loss": 0.4292, + "num_input_tokens_seen": 7601728, + "step": 11655 + }, + { + "epoch": 6.875, + "grad_norm": 3.870276927947998, + "learning_rate": 8.297115911819379e-06, + "loss": 0.4201, + "num_input_tokens_seen": 7604288, + "step": 11660 + }, + { + "epoch": 6.877948113207547, + "grad_norm": 2.3912394046783447, + "learning_rate": 8.295181381135884e-06, + "loss": 0.4505, + "num_input_tokens_seen": 7607648, + "step": 11665 + }, + { + "epoch": 6.880896226415095, + "grad_norm": 1.5460768938064575, + "learning_rate": 8.293245978038917e-06, + "loss": 0.376, + "num_input_tokens_seen": 7611360, + "step": 11670 + }, + { + "epoch": 6.883844339622642, + "grad_norm": 2.4147145748138428, + "learning_rate": 8.291309703040884e-06, + "loss": 0.3271, + "num_input_tokens_seen": 7617664, + "step": 11675 + }, + { + "epoch": 6.886792452830189, + "grad_norm": 1.2025635242462158, + "learning_rate": 8.289372556654422e-06, + "loss": 0.4598, + "num_input_tokens_seen": 7620480, + "step": 11680 + }, + { + "epoch": 6.8897405660377355, + "grad_norm": 1.700533390045166, + "learning_rate": 8.287434539392401e-06, + "loss": 0.4873, + "num_input_tokens_seen": 7624352, + "step": 11685 + }, + { + "epoch": 6.892688679245283, + "grad_norm": 4.3826680183410645, + "learning_rate": 8.285495651767916e-06, + "loss": 0.4431, + "num_input_tokens_seen": 7627296, + "step": 11690 + }, + { + "epoch": 6.89563679245283, + "grad_norm": 2.450436592102051, + "learning_rate": 8.283555894294297e-06, + "loss": 0.4, + "num_input_tokens_seen": 7630816, + "step": 11695 + }, + { + "epoch": 6.898584905660377, + "grad_norm": 1.6770349740982056, + "learning_rate": 8.281615267485105e-06, + "loss": 0.5964, + "num_input_tokens_seen": 7633920, + "step": 11700 + }, + { + "epoch": 6.901533018867925, + "grad_norm": 1.8706599473953247, + "learning_rate": 8.279673771854127e-06, + "loss": 0.4768, + "num_input_tokens_seen": 7637920, + "step": 11705 + }, + { + "epoch": 6.904481132075472, + "grad_norm": 3.2704851627349854, + "learning_rate": 8.277731407915386e-06, + "loss": 0.541, + "num_input_tokens_seen": 7640800, + "step": 11710 + }, + { + "epoch": 6.907429245283019, + "grad_norm": 3.3839197158813477, + "learning_rate": 8.275788176183126e-06, + "loss": 0.3586, + "num_input_tokens_seen": 7644480, + "step": 11715 + }, + { + "epoch": 6.910377358490566, + "grad_norm": 1.4205790758132935, + "learning_rate": 8.273844077171827e-06, + "loss": 0.4119, + "num_input_tokens_seen": 7647936, + "step": 11720 + }, + { + "epoch": 6.913325471698113, + "grad_norm": 1.471185326576233, + "learning_rate": 8.271899111396202e-06, + "loss": 0.4406, + "num_input_tokens_seen": 7651264, + "step": 11725 + }, + { + "epoch": 6.91627358490566, + "grad_norm": 1.936965823173523, + "learning_rate": 8.269953279371185e-06, + "loss": 0.492, + "num_input_tokens_seen": 7654496, + "step": 11730 + }, + { + "epoch": 6.9192216981132075, + "grad_norm": 1.8686084747314453, + "learning_rate": 8.268006581611945e-06, + "loss": 0.4332, + "num_input_tokens_seen": 7657856, + "step": 11735 + }, + { + "epoch": 6.922169811320755, + "grad_norm": 2.4149816036224365, + "learning_rate": 8.266059018633878e-06, + "loss": 0.4429, + "num_input_tokens_seen": 7660192, + "step": 11740 + }, + { + "epoch": 6.925117924528302, + "grad_norm": 1.4742753505706787, + "learning_rate": 8.264110590952609e-06, + "loss": 0.3879, + "num_input_tokens_seen": 7663904, + "step": 11745 + }, + { + "epoch": 6.928066037735849, + "grad_norm": 1.7058687210083008, + "learning_rate": 8.262161299083993e-06, + "loss": 0.3115, + "num_input_tokens_seen": 7666688, + "step": 11750 + }, + { + "epoch": 6.931014150943396, + "grad_norm": 1.8989156484603882, + "learning_rate": 8.260211143544117e-06, + "loss": 0.3835, + "num_input_tokens_seen": 7670080, + "step": 11755 + }, + { + "epoch": 6.933962264150943, + "grad_norm": 2.674440860748291, + "learning_rate": 8.258260124849288e-06, + "loss": 0.4932, + "num_input_tokens_seen": 7672768, + "step": 11760 + }, + { + "epoch": 6.93691037735849, + "grad_norm": 1.8931361436843872, + "learning_rate": 8.256308243516048e-06, + "loss": 0.3607, + "num_input_tokens_seen": 7675744, + "step": 11765 + }, + { + "epoch": 6.939858490566038, + "grad_norm": 1.975437879562378, + "learning_rate": 8.254355500061168e-06, + "loss": 0.3709, + "num_input_tokens_seen": 7678400, + "step": 11770 + }, + { + "epoch": 6.942806603773585, + "grad_norm": 2.361424446105957, + "learning_rate": 8.252401895001643e-06, + "loss": 0.6518, + "num_input_tokens_seen": 7681760, + "step": 11775 + }, + { + "epoch": 6.945754716981132, + "grad_norm": 1.490308165550232, + "learning_rate": 8.2504474288547e-06, + "loss": 0.5036, + "num_input_tokens_seen": 7684480, + "step": 11780 + }, + { + "epoch": 6.9487028301886795, + "grad_norm": 1.8954814672470093, + "learning_rate": 8.248492102137791e-06, + "loss": 0.4585, + "num_input_tokens_seen": 7686816, + "step": 11785 + }, + { + "epoch": 6.951650943396227, + "grad_norm": 1.5862764120101929, + "learning_rate": 8.246535915368596e-06, + "loss": 0.4664, + "num_input_tokens_seen": 7691072, + "step": 11790 + }, + { + "epoch": 6.954599056603773, + "grad_norm": 1.515276312828064, + "learning_rate": 8.24457886906503e-06, + "loss": 0.3612, + "num_input_tokens_seen": 7695168, + "step": 11795 + }, + { + "epoch": 6.9575471698113205, + "grad_norm": 2.1550660133361816, + "learning_rate": 8.242620963745222e-06, + "loss": 0.3018, + "num_input_tokens_seen": 7698560, + "step": 11800 + }, + { + "epoch": 6.960495283018868, + "grad_norm": 2.737398386001587, + "learning_rate": 8.240662199927538e-06, + "loss": 0.4631, + "num_input_tokens_seen": 7702016, + "step": 11805 + }, + { + "epoch": 6.963443396226415, + "grad_norm": 3.3583672046661377, + "learning_rate": 8.238702578130573e-06, + "loss": 0.4655, + "num_input_tokens_seen": 7705376, + "step": 11810 + }, + { + "epoch": 6.966391509433962, + "grad_norm": 2.2687034606933594, + "learning_rate": 8.23674209887314e-06, + "loss": 0.6098, + "num_input_tokens_seen": 7708128, + "step": 11815 + }, + { + "epoch": 6.96933962264151, + "grad_norm": 1.68133544921875, + "learning_rate": 8.234780762674288e-06, + "loss": 0.3823, + "num_input_tokens_seen": 7711776, + "step": 11820 + }, + { + "epoch": 6.972287735849057, + "grad_norm": 1.64499032497406, + "learning_rate": 8.232818570053286e-06, + "loss": 0.4944, + "num_input_tokens_seen": 7714976, + "step": 11825 + }, + { + "epoch": 6.975235849056604, + "grad_norm": 2.100989818572998, + "learning_rate": 8.230855521529637e-06, + "loss": 0.483, + "num_input_tokens_seen": 7718432, + "step": 11830 + }, + { + "epoch": 6.978183962264151, + "grad_norm": 1.3968454599380493, + "learning_rate": 8.228891617623064e-06, + "loss": 0.6142, + "num_input_tokens_seen": 7720640, + "step": 11835 + }, + { + "epoch": 6.981132075471698, + "grad_norm": 2.4207065105438232, + "learning_rate": 8.22692685885352e-06, + "loss": 0.4694, + "num_input_tokens_seen": 7723232, + "step": 11840 + }, + { + "epoch": 6.984080188679245, + "grad_norm": 2.4904093742370605, + "learning_rate": 8.224961245741183e-06, + "loss": 0.3445, + "num_input_tokens_seen": 7727712, + "step": 11845 + }, + { + "epoch": 6.9870283018867925, + "grad_norm": 1.526089072227478, + "learning_rate": 8.222994778806457e-06, + "loss": 0.4319, + "num_input_tokens_seen": 7730816, + "step": 11850 + }, + { + "epoch": 6.98997641509434, + "grad_norm": 2.62384295463562, + "learning_rate": 8.221027458569972e-06, + "loss": 0.511, + "num_input_tokens_seen": 7734336, + "step": 11855 + }, + { + "epoch": 6.992924528301887, + "grad_norm": 2.4000675678253174, + "learning_rate": 8.219059285552586e-06, + "loss": 0.4371, + "num_input_tokens_seen": 7737504, + "step": 11860 + }, + { + "epoch": 6.995872641509434, + "grad_norm": 2.3739373683929443, + "learning_rate": 8.21709026027538e-06, + "loss": 0.5436, + "num_input_tokens_seen": 7741952, + "step": 11865 + }, + { + "epoch": 6.998820754716981, + "grad_norm": 2.0545756816864014, + "learning_rate": 8.215120383259664e-06, + "loss": 0.4869, + "num_input_tokens_seen": 7745280, + "step": 11870 + }, + { + "epoch": 7.001768867924528, + "grad_norm": 1.3189345598220825, + "learning_rate": 8.21314965502697e-06, + "loss": 0.4837, + "num_input_tokens_seen": 7747712, + "step": 11875 + }, + { + "epoch": 7.004716981132075, + "grad_norm": 1.4737904071807861, + "learning_rate": 8.211178076099056e-06, + "loss": 0.3015, + "num_input_tokens_seen": 7750848, + "step": 11880 + }, + { + "epoch": 7.007665094339623, + "grad_norm": 2.4677116870880127, + "learning_rate": 8.209205646997909e-06, + "loss": 0.3273, + "num_input_tokens_seen": 7754688, + "step": 11885 + }, + { + "epoch": 7.01061320754717, + "grad_norm": 1.9036543369293213, + "learning_rate": 8.207232368245735e-06, + "loss": 0.3342, + "num_input_tokens_seen": 7758688, + "step": 11890 + }, + { + "epoch": 7.013561320754717, + "grad_norm": 1.569035291671753, + "learning_rate": 8.205258240364968e-06, + "loss": 0.3225, + "num_input_tokens_seen": 7762048, + "step": 11895 + }, + { + "epoch": 7.0165094339622645, + "grad_norm": 2.288790225982666, + "learning_rate": 8.203283263878268e-06, + "loss": 0.3935, + "num_input_tokens_seen": 7764448, + "step": 11900 + }, + { + "epoch": 7.019457547169812, + "grad_norm": 2.6840505599975586, + "learning_rate": 8.201307439308518e-06, + "loss": 0.3824, + "num_input_tokens_seen": 7766816, + "step": 11905 + }, + { + "epoch": 7.022405660377358, + "grad_norm": 2.3313894271850586, + "learning_rate": 8.199330767178828e-06, + "loss": 0.4043, + "num_input_tokens_seen": 7770688, + "step": 11910 + }, + { + "epoch": 7.025353773584905, + "grad_norm": 2.881484031677246, + "learning_rate": 8.19735324801253e-06, + "loss": 0.5366, + "num_input_tokens_seen": 7773632, + "step": 11915 + }, + { + "epoch": 7.028301886792453, + "grad_norm": 2.8905062675476074, + "learning_rate": 8.195374882333178e-06, + "loss": 0.5753, + "num_input_tokens_seen": 7776224, + "step": 11920 + }, + { + "epoch": 7.03125, + "grad_norm": 2.180483341217041, + "learning_rate": 8.193395670664555e-06, + "loss": 0.4906, + "num_input_tokens_seen": 7778816, + "step": 11925 + }, + { + "epoch": 7.034198113207547, + "grad_norm": 1.6871193647384644, + "learning_rate": 8.191415613530667e-06, + "loss": 0.4363, + "num_input_tokens_seen": 7781056, + "step": 11930 + }, + { + "epoch": 7.037146226415095, + "grad_norm": 2.294992446899414, + "learning_rate": 8.189434711455739e-06, + "loss": 0.4459, + "num_input_tokens_seen": 7784064, + "step": 11935 + }, + { + "epoch": 7.040094339622642, + "grad_norm": 1.1303610801696777, + "learning_rate": 8.187452964964226e-06, + "loss": 0.416, + "num_input_tokens_seen": 7787616, + "step": 11940 + }, + { + "epoch": 7.043042452830188, + "grad_norm": 1.7973695993423462, + "learning_rate": 8.185470374580805e-06, + "loss": 0.5803, + "num_input_tokens_seen": 7791168, + "step": 11945 + }, + { + "epoch": 7.0459905660377355, + "grad_norm": 2.594310998916626, + "learning_rate": 8.183486940830371e-06, + "loss": 0.4739, + "num_input_tokens_seen": 7794784, + "step": 11950 + }, + { + "epoch": 7.048938679245283, + "grad_norm": 1.482409119606018, + "learning_rate": 8.18150266423805e-06, + "loss": 0.4084, + "num_input_tokens_seen": 7797984, + "step": 11955 + }, + { + "epoch": 7.05188679245283, + "grad_norm": 1.6465197801589966, + "learning_rate": 8.179517545329188e-06, + "loss": 0.4017, + "num_input_tokens_seen": 7801472, + "step": 11960 + }, + { + "epoch": 7.054834905660377, + "grad_norm": 2.653254508972168, + "learning_rate": 8.177531584629353e-06, + "loss": 0.2901, + "num_input_tokens_seen": 7803808, + "step": 11965 + }, + { + "epoch": 7.057783018867925, + "grad_norm": 2.5850114822387695, + "learning_rate": 8.175544782664335e-06, + "loss": 0.4454, + "num_input_tokens_seen": 7806528, + "step": 11970 + }, + { + "epoch": 7.060731132075472, + "grad_norm": 1.5508613586425781, + "learning_rate": 8.173557139960151e-06, + "loss": 0.4679, + "num_input_tokens_seen": 7809856, + "step": 11975 + }, + { + "epoch": 7.063679245283019, + "grad_norm": 2.1336863040924072, + "learning_rate": 8.17156865704304e-06, + "loss": 0.3541, + "num_input_tokens_seen": 7815200, + "step": 11980 + }, + { + "epoch": 7.066627358490566, + "grad_norm": 2.1790597438812256, + "learning_rate": 8.169579334439453e-06, + "loss": 0.5896, + "num_input_tokens_seen": 7817824, + "step": 11985 + }, + { + "epoch": 7.069575471698113, + "grad_norm": 2.0576560497283936, + "learning_rate": 8.16758917267608e-06, + "loss": 0.3398, + "num_input_tokens_seen": 7821856, + "step": 11990 + }, + { + "epoch": 7.07252358490566, + "grad_norm": 2.7320148944854736, + "learning_rate": 8.165598172279822e-06, + "loss": 0.4701, + "num_input_tokens_seen": 7824544, + "step": 11995 + }, + { + "epoch": 7.0754716981132075, + "grad_norm": 3.282135009765625, + "learning_rate": 8.163606333777804e-06, + "loss": 0.4128, + "num_input_tokens_seen": 7827328, + "step": 12000 + }, + { + "epoch": 7.078419811320755, + "grad_norm": 1.6186277866363525, + "learning_rate": 8.161613657697374e-06, + "loss": 0.421, + "num_input_tokens_seen": 7830880, + "step": 12005 + }, + { + "epoch": 7.081367924528302, + "grad_norm": 2.187924861907959, + "learning_rate": 8.159620144566103e-06, + "loss": 0.3233, + "num_input_tokens_seen": 7833920, + "step": 12010 + }, + { + "epoch": 7.084316037735849, + "grad_norm": 2.0952911376953125, + "learning_rate": 8.157625794911782e-06, + "loss": 0.3889, + "num_input_tokens_seen": 7836768, + "step": 12015 + }, + { + "epoch": 7.087264150943396, + "grad_norm": 5.083712577819824, + "learning_rate": 8.155630609262424e-06, + "loss": 0.3213, + "num_input_tokens_seen": 7840576, + "step": 12020 + }, + { + "epoch": 7.090212264150943, + "grad_norm": 1.8995298147201538, + "learning_rate": 8.153634588146262e-06, + "loss": 0.371, + "num_input_tokens_seen": 7844000, + "step": 12025 + }, + { + "epoch": 7.09316037735849, + "grad_norm": 2.383364677429199, + "learning_rate": 8.15163773209175e-06, + "loss": 0.4403, + "num_input_tokens_seen": 7847072, + "step": 12030 + }, + { + "epoch": 7.096108490566038, + "grad_norm": 2.094858169555664, + "learning_rate": 8.149640041627566e-06, + "loss": 0.3444, + "num_input_tokens_seen": 7851264, + "step": 12035 + }, + { + "epoch": 7.099056603773585, + "grad_norm": 1.9967538118362427, + "learning_rate": 8.147641517282608e-06, + "loss": 0.5278, + "num_input_tokens_seen": 7854656, + "step": 12040 + }, + { + "epoch": 7.102004716981132, + "grad_norm": 2.886871099472046, + "learning_rate": 8.145642159585992e-06, + "loss": 0.5214, + "num_input_tokens_seen": 7859520, + "step": 12045 + }, + { + "epoch": 7.1049528301886795, + "grad_norm": 1.9680029153823853, + "learning_rate": 8.143641969067057e-06, + "loss": 0.3462, + "num_input_tokens_seen": 7862400, + "step": 12050 + }, + { + "epoch": 7.107900943396227, + "grad_norm": 2.7757222652435303, + "learning_rate": 8.141640946255362e-06, + "loss": 0.3447, + "num_input_tokens_seen": 7865088, + "step": 12055 + }, + { + "epoch": 7.110849056603773, + "grad_norm": 2.1096463203430176, + "learning_rate": 8.139639091680687e-06, + "loss": 0.3318, + "num_input_tokens_seen": 7868736, + "step": 12060 + }, + { + "epoch": 7.1137971698113205, + "grad_norm": 2.77583646774292, + "learning_rate": 8.137636405873031e-06, + "loss": 0.5793, + "num_input_tokens_seen": 7871552, + "step": 12065 + }, + { + "epoch": 7.116745283018868, + "grad_norm": 2.2539403438568115, + "learning_rate": 8.135632889362614e-06, + "loss": 0.4085, + "num_input_tokens_seen": 7874240, + "step": 12070 + }, + { + "epoch": 7.119693396226415, + "grad_norm": 3.021364212036133, + "learning_rate": 8.133628542679879e-06, + "loss": 0.4844, + "num_input_tokens_seen": 7877632, + "step": 12075 + }, + { + "epoch": 7.122641509433962, + "grad_norm": 4.49337100982666, + "learning_rate": 8.131623366355478e-06, + "loss": 0.4264, + "num_input_tokens_seen": 7880160, + "step": 12080 + }, + { + "epoch": 7.12558962264151, + "grad_norm": 1.7100971937179565, + "learning_rate": 8.129617360920297e-06, + "loss": 0.4005, + "num_input_tokens_seen": 7883136, + "step": 12085 + }, + { + "epoch": 7.128537735849057, + "grad_norm": 2.2806668281555176, + "learning_rate": 8.12761052690543e-06, + "loss": 0.2636, + "num_input_tokens_seen": 7885408, + "step": 12090 + }, + { + "epoch": 7.131485849056604, + "grad_norm": 2.138929605484009, + "learning_rate": 8.125602864842197e-06, + "loss": 0.4684, + "num_input_tokens_seen": 7888224, + "step": 12095 + }, + { + "epoch": 7.134433962264151, + "grad_norm": 2.133676528930664, + "learning_rate": 8.123594375262135e-06, + "loss": 0.4637, + "num_input_tokens_seen": 7891808, + "step": 12100 + }, + { + "epoch": 7.137382075471698, + "grad_norm": 2.332988739013672, + "learning_rate": 8.121585058697e-06, + "loss": 0.4083, + "num_input_tokens_seen": 7894912, + "step": 12105 + }, + { + "epoch": 7.140330188679245, + "grad_norm": 1.7388025522232056, + "learning_rate": 8.119574915678767e-06, + "loss": 0.5051, + "num_input_tokens_seen": 7898560, + "step": 12110 + }, + { + "epoch": 7.1432783018867925, + "grad_norm": 2.162344455718994, + "learning_rate": 8.117563946739632e-06, + "loss": 0.345, + "num_input_tokens_seen": 7901248, + "step": 12115 + }, + { + "epoch": 7.14622641509434, + "grad_norm": 2.1779322624206543, + "learning_rate": 8.115552152412006e-06, + "loss": 0.3347, + "num_input_tokens_seen": 7905088, + "step": 12120 + }, + { + "epoch": 7.149174528301887, + "grad_norm": 2.9986374378204346, + "learning_rate": 8.11353953322852e-06, + "loss": 0.4253, + "num_input_tokens_seen": 7907936, + "step": 12125 + }, + { + "epoch": 7.152122641509434, + "grad_norm": 5.630772590637207, + "learning_rate": 8.111526089722024e-06, + "loss": 0.424, + "num_input_tokens_seen": 7910496, + "step": 12130 + }, + { + "epoch": 7.155070754716981, + "grad_norm": 1.6223100423812866, + "learning_rate": 8.109511822425586e-06, + "loss": 0.4137, + "num_input_tokens_seen": 7913856, + "step": 12135 + }, + { + "epoch": 7.158018867924528, + "grad_norm": 2.9513845443725586, + "learning_rate": 8.107496731872491e-06, + "loss": 0.3229, + "num_input_tokens_seen": 7919200, + "step": 12140 + }, + { + "epoch": 7.160966981132075, + "grad_norm": 3.971353769302368, + "learning_rate": 8.105480818596243e-06, + "loss": 0.3888, + "num_input_tokens_seen": 7922048, + "step": 12145 + }, + { + "epoch": 7.163915094339623, + "grad_norm": 2.063004493713379, + "learning_rate": 8.103464083130566e-06, + "loss": 0.401, + "num_input_tokens_seen": 7924672, + "step": 12150 + }, + { + "epoch": 7.16686320754717, + "grad_norm": 2.603506326675415, + "learning_rate": 8.101446526009397e-06, + "loss": 0.4345, + "num_input_tokens_seen": 7927744, + "step": 12155 + }, + { + "epoch": 7.169811320754717, + "grad_norm": 2.417618989944458, + "learning_rate": 8.099428147766894e-06, + "loss": 0.347, + "num_input_tokens_seen": 7930816, + "step": 12160 + }, + { + "epoch": 7.1727594339622645, + "grad_norm": 3.124481439590454, + "learning_rate": 8.097408948937431e-06, + "loss": 0.4496, + "num_input_tokens_seen": 7933408, + "step": 12165 + }, + { + "epoch": 7.175707547169812, + "grad_norm": 1.0909606218338013, + "learning_rate": 8.095388930055599e-06, + "loss": 0.3722, + "num_input_tokens_seen": 7937216, + "step": 12170 + }, + { + "epoch": 7.178655660377358, + "grad_norm": 3.3752760887145996, + "learning_rate": 8.093368091656209e-06, + "loss": 0.3512, + "num_input_tokens_seen": 7940800, + "step": 12175 + }, + { + "epoch": 7.181603773584905, + "grad_norm": 2.5349836349487305, + "learning_rate": 8.091346434274284e-06, + "loss": 0.4128, + "num_input_tokens_seen": 7944000, + "step": 12180 + }, + { + "epoch": 7.184551886792453, + "grad_norm": 2.0393357276916504, + "learning_rate": 8.089323958445068e-06, + "loss": 0.4007, + "num_input_tokens_seen": 7946144, + "step": 12185 + }, + { + "epoch": 7.1875, + "grad_norm": 3.878053903579712, + "learning_rate": 8.08730066470402e-06, + "loss": 0.4061, + "num_input_tokens_seen": 7948416, + "step": 12190 + }, + { + "epoch": 7.190448113207547, + "grad_norm": 2.6362884044647217, + "learning_rate": 8.085276553586814e-06, + "loss": 0.3687, + "num_input_tokens_seen": 7950976, + "step": 12195 + }, + { + "epoch": 7.193396226415095, + "grad_norm": 1.5253713130950928, + "learning_rate": 8.083251625629345e-06, + "loss": 0.4074, + "num_input_tokens_seen": 7953824, + "step": 12200 + }, + { + "epoch": 7.196344339622642, + "grad_norm": 3.064913272857666, + "learning_rate": 8.08122588136772e-06, + "loss": 0.4512, + "num_input_tokens_seen": 7956576, + "step": 12205 + }, + { + "epoch": 7.199292452830188, + "grad_norm": 2.297760248184204, + "learning_rate": 8.079199321338262e-06, + "loss": 0.4588, + "num_input_tokens_seen": 7959584, + "step": 12210 + }, + { + "epoch": 7.2022405660377355, + "grad_norm": 1.8918788433074951, + "learning_rate": 8.077171946077516e-06, + "loss": 0.411, + "num_input_tokens_seen": 7963712, + "step": 12215 + }, + { + "epoch": 7.205188679245283, + "grad_norm": 3.343000888824463, + "learning_rate": 8.075143756122232e-06, + "loss": 0.415, + "num_input_tokens_seen": 7967680, + "step": 12220 + }, + { + "epoch": 7.20813679245283, + "grad_norm": 2.3498241901397705, + "learning_rate": 8.073114752009388e-06, + "loss": 0.379, + "num_input_tokens_seen": 7970464, + "step": 12225 + }, + { + "epoch": 7.211084905660377, + "grad_norm": 2.0956473350524902, + "learning_rate": 8.071084934276168e-06, + "loss": 0.5107, + "num_input_tokens_seen": 7973696, + "step": 12230 + }, + { + "epoch": 7.214033018867925, + "grad_norm": 1.7145568132400513, + "learning_rate": 8.069054303459976e-06, + "loss": 0.3574, + "num_input_tokens_seen": 7976352, + "step": 12235 + }, + { + "epoch": 7.216981132075472, + "grad_norm": 2.117650270462036, + "learning_rate": 8.06702286009843e-06, + "loss": 0.5213, + "num_input_tokens_seen": 7979712, + "step": 12240 + }, + { + "epoch": 7.219929245283019, + "grad_norm": 2.0798380374908447, + "learning_rate": 8.064990604729363e-06, + "loss": 0.4471, + "num_input_tokens_seen": 7982656, + "step": 12245 + }, + { + "epoch": 7.222877358490566, + "grad_norm": 2.9221270084381104, + "learning_rate": 8.062957537890827e-06, + "loss": 0.3829, + "num_input_tokens_seen": 7985408, + "step": 12250 + }, + { + "epoch": 7.225825471698113, + "grad_norm": 2.000239372253418, + "learning_rate": 8.060923660121081e-06, + "loss": 0.4369, + "num_input_tokens_seen": 7988128, + "step": 12255 + }, + { + "epoch": 7.22877358490566, + "grad_norm": 2.1756668090820312, + "learning_rate": 8.058888971958603e-06, + "loss": 0.4046, + "num_input_tokens_seen": 7991136, + "step": 12260 + }, + { + "epoch": 7.2317216981132075, + "grad_norm": 1.5905725955963135, + "learning_rate": 8.056853473942085e-06, + "loss": 0.3102, + "num_input_tokens_seen": 7994176, + "step": 12265 + }, + { + "epoch": 7.234669811320755, + "grad_norm": 6.826207160949707, + "learning_rate": 8.054817166610438e-06, + "loss": 0.4642, + "num_input_tokens_seen": 7997184, + "step": 12270 + }, + { + "epoch": 7.237617924528302, + "grad_norm": 1.7690850496292114, + "learning_rate": 8.052780050502781e-06, + "loss": 0.3713, + "num_input_tokens_seen": 8001056, + "step": 12275 + }, + { + "epoch": 7.240566037735849, + "grad_norm": 2.6282012462615967, + "learning_rate": 8.050742126158448e-06, + "loss": 0.4258, + "num_input_tokens_seen": 8004416, + "step": 12280 + }, + { + "epoch": 7.243514150943396, + "grad_norm": 1.553707242012024, + "learning_rate": 8.04870339411699e-06, + "loss": 0.5149, + "num_input_tokens_seen": 8008768, + "step": 12285 + }, + { + "epoch": 7.246462264150943, + "grad_norm": 2.539104461669922, + "learning_rate": 8.046663854918166e-06, + "loss": 0.4366, + "num_input_tokens_seen": 8012192, + "step": 12290 + }, + { + "epoch": 7.24941037735849, + "grad_norm": 1.9790639877319336, + "learning_rate": 8.044623509101959e-06, + "loss": 0.3766, + "num_input_tokens_seen": 8015616, + "step": 12295 + }, + { + "epoch": 7.252358490566038, + "grad_norm": 2.757936954498291, + "learning_rate": 8.042582357208557e-06, + "loss": 0.3876, + "num_input_tokens_seen": 8018752, + "step": 12300 + }, + { + "epoch": 7.255306603773585, + "grad_norm": 1.4044712781906128, + "learning_rate": 8.04054039977836e-06, + "loss": 0.4784, + "num_input_tokens_seen": 8022656, + "step": 12305 + }, + { + "epoch": 7.258254716981132, + "grad_norm": 0.9646516442298889, + "learning_rate": 8.038497637351992e-06, + "loss": 0.3863, + "num_input_tokens_seen": 8025984, + "step": 12310 + }, + { + "epoch": 7.2612028301886795, + "grad_norm": 4.003399848937988, + "learning_rate": 8.036454070470276e-06, + "loss": 0.523, + "num_input_tokens_seen": 8028352, + "step": 12315 + }, + { + "epoch": 7.264150943396227, + "grad_norm": 2.3622994422912598, + "learning_rate": 8.03440969967426e-06, + "loss": 0.4538, + "num_input_tokens_seen": 8030912, + "step": 12320 + }, + { + "epoch": 7.267099056603773, + "grad_norm": 2.335052967071533, + "learning_rate": 8.032364525505198e-06, + "loss": 0.3874, + "num_input_tokens_seen": 8034432, + "step": 12325 + }, + { + "epoch": 7.2700471698113205, + "grad_norm": 2.0673749446868896, + "learning_rate": 8.030318548504561e-06, + "loss": 0.6288, + "num_input_tokens_seen": 8037440, + "step": 12330 + }, + { + "epoch": 7.272995283018868, + "grad_norm": 1.7539925575256348, + "learning_rate": 8.028271769214026e-06, + "loss": 0.3659, + "num_input_tokens_seen": 8040064, + "step": 12335 + }, + { + "epoch": 7.275943396226415, + "grad_norm": 2.6530513763427734, + "learning_rate": 8.02622418817549e-06, + "loss": 0.4535, + "num_input_tokens_seen": 8043840, + "step": 12340 + }, + { + "epoch": 7.278891509433962, + "grad_norm": 3.1688613891601562, + "learning_rate": 8.024175805931056e-06, + "loss": 0.5111, + "num_input_tokens_seen": 8047392, + "step": 12345 + }, + { + "epoch": 7.28183962264151, + "grad_norm": 2.2103347778320312, + "learning_rate": 8.022126623023045e-06, + "loss": 0.398, + "num_input_tokens_seen": 8050080, + "step": 12350 + }, + { + "epoch": 7.284787735849057, + "grad_norm": 1.5307804346084595, + "learning_rate": 8.020076639993987e-06, + "loss": 0.4353, + "num_input_tokens_seen": 8053504, + "step": 12355 + }, + { + "epoch": 7.287735849056604, + "grad_norm": 1.8239808082580566, + "learning_rate": 8.018025857386622e-06, + "loss": 0.3766, + "num_input_tokens_seen": 8056256, + "step": 12360 + }, + { + "epoch": 7.290683962264151, + "grad_norm": 1.2403656244277954, + "learning_rate": 8.015974275743905e-06, + "loss": 0.3381, + "num_input_tokens_seen": 8059616, + "step": 12365 + }, + { + "epoch": 7.293632075471698, + "grad_norm": 2.1340689659118652, + "learning_rate": 8.013921895609e-06, + "loss": 0.2365, + "num_input_tokens_seen": 8062944, + "step": 12370 + }, + { + "epoch": 7.296580188679245, + "grad_norm": 1.7445706129074097, + "learning_rate": 8.011868717525283e-06, + "loss": 0.3709, + "num_input_tokens_seen": 8067008, + "step": 12375 + }, + { + "epoch": 7.2995283018867925, + "grad_norm": 3.3448872566223145, + "learning_rate": 8.009814742036343e-06, + "loss": 0.4789, + "num_input_tokens_seen": 8069632, + "step": 12380 + }, + { + "epoch": 7.30247641509434, + "grad_norm": 2.8965771198272705, + "learning_rate": 8.007759969685979e-06, + "loss": 0.3749, + "num_input_tokens_seen": 8072160, + "step": 12385 + }, + { + "epoch": 7.305424528301887, + "grad_norm": 2.9302456378936768, + "learning_rate": 8.005704401018199e-06, + "loss": 0.4917, + "num_input_tokens_seen": 8075488, + "step": 12390 + }, + { + "epoch": 7.308372641509434, + "grad_norm": 4.608858585357666, + "learning_rate": 8.003648036577226e-06, + "loss": 0.4914, + "num_input_tokens_seen": 8079040, + "step": 12395 + }, + { + "epoch": 7.311320754716981, + "grad_norm": 2.5446343421936035, + "learning_rate": 8.00159087690749e-06, + "loss": 0.3792, + "num_input_tokens_seen": 8081728, + "step": 12400 + }, + { + "epoch": 7.314268867924528, + "grad_norm": 1.6422923803329468, + "learning_rate": 7.999532922553635e-06, + "loss": 0.3497, + "num_input_tokens_seen": 8084832, + "step": 12405 + }, + { + "epoch": 7.317216981132075, + "grad_norm": 1.8990668058395386, + "learning_rate": 7.997474174060508e-06, + "loss": 0.4747, + "num_input_tokens_seen": 8088416, + "step": 12410 + }, + { + "epoch": 7.320165094339623, + "grad_norm": 2.0357236862182617, + "learning_rate": 7.995414631973179e-06, + "loss": 0.2893, + "num_input_tokens_seen": 8091584, + "step": 12415 + }, + { + "epoch": 7.32311320754717, + "grad_norm": 2.379852056503296, + "learning_rate": 7.993354296836914e-06, + "loss": 0.2884, + "num_input_tokens_seen": 8094208, + "step": 12420 + }, + { + "epoch": 7.326061320754717, + "grad_norm": 2.0941812992095947, + "learning_rate": 7.991293169197198e-06, + "loss": 0.3217, + "num_input_tokens_seen": 8096864, + "step": 12425 + }, + { + "epoch": 7.3290094339622645, + "grad_norm": 3.003298759460449, + "learning_rate": 7.989231249599725e-06, + "loss": 0.4849, + "num_input_tokens_seen": 8099712, + "step": 12430 + }, + { + "epoch": 7.331957547169811, + "grad_norm": 2.5988895893096924, + "learning_rate": 7.987168538590395e-06, + "loss": 0.3672, + "num_input_tokens_seen": 8102400, + "step": 12435 + }, + { + "epoch": 7.334905660377358, + "grad_norm": 0.8338944911956787, + "learning_rate": 7.985105036715322e-06, + "loss": 0.2029, + "num_input_tokens_seen": 8108288, + "step": 12440 + }, + { + "epoch": 7.337853773584905, + "grad_norm": 1.8371944427490234, + "learning_rate": 7.983040744520823e-06, + "loss": 0.2693, + "num_input_tokens_seen": 8111008, + "step": 12445 + }, + { + "epoch": 7.340801886792453, + "grad_norm": 1.773409128189087, + "learning_rate": 7.980975662553432e-06, + "loss": 0.4571, + "num_input_tokens_seen": 8115008, + "step": 12450 + }, + { + "epoch": 7.34375, + "grad_norm": 1.9357980489730835, + "learning_rate": 7.978909791359888e-06, + "loss": 0.3693, + "num_input_tokens_seen": 8118752, + "step": 12455 + }, + { + "epoch": 7.346698113207547, + "grad_norm": 1.732206106185913, + "learning_rate": 7.976843131487136e-06, + "loss": 0.5031, + "num_input_tokens_seen": 8122048, + "step": 12460 + }, + { + "epoch": 7.349646226415095, + "grad_norm": 2.9061222076416016, + "learning_rate": 7.974775683482337e-06, + "loss": 0.4317, + "num_input_tokens_seen": 8124448, + "step": 12465 + }, + { + "epoch": 7.352594339622642, + "grad_norm": 1.3417195081710815, + "learning_rate": 7.972707447892855e-06, + "loss": 0.5957, + "num_input_tokens_seen": 8128096, + "step": 12470 + }, + { + "epoch": 7.355542452830189, + "grad_norm": 2.0616581439971924, + "learning_rate": 7.970638425266264e-06, + "loss": 0.5007, + "num_input_tokens_seen": 8131488, + "step": 12475 + }, + { + "epoch": 7.3584905660377355, + "grad_norm": 2.0164268016815186, + "learning_rate": 7.968568616150349e-06, + "loss": 0.3962, + "num_input_tokens_seen": 8134688, + "step": 12480 + }, + { + "epoch": 7.361438679245283, + "grad_norm": 1.1332072019577026, + "learning_rate": 7.966498021093096e-06, + "loss": 0.3502, + "num_input_tokens_seen": 8138368, + "step": 12485 + }, + { + "epoch": 7.36438679245283, + "grad_norm": 19.94340705871582, + "learning_rate": 7.96442664064271e-06, + "loss": 0.5062, + "num_input_tokens_seen": 8141408, + "step": 12490 + }, + { + "epoch": 7.367334905660377, + "grad_norm": 2.6861815452575684, + "learning_rate": 7.962354475347593e-06, + "loss": 0.4325, + "num_input_tokens_seen": 8143872, + "step": 12495 + }, + { + "epoch": 7.370283018867925, + "grad_norm": 1.2617459297180176, + "learning_rate": 7.960281525756364e-06, + "loss": 0.2763, + "num_input_tokens_seen": 8148096, + "step": 12500 + }, + { + "epoch": 7.373231132075472, + "grad_norm": 3.2648725509643555, + "learning_rate": 7.95820779241784e-06, + "loss": 0.5184, + "num_input_tokens_seen": 8150848, + "step": 12505 + }, + { + "epoch": 7.376179245283019, + "grad_norm": 2.480302095413208, + "learning_rate": 7.956133275881055e-06, + "loss": 0.4826, + "num_input_tokens_seen": 8155232, + "step": 12510 + }, + { + "epoch": 7.379127358490566, + "grad_norm": 1.5568878650665283, + "learning_rate": 7.954057976695244e-06, + "loss": 0.4783, + "num_input_tokens_seen": 8158912, + "step": 12515 + }, + { + "epoch": 7.382075471698113, + "grad_norm": 1.5117324590682983, + "learning_rate": 7.951981895409854e-06, + "loss": 0.3174, + "num_input_tokens_seen": 8161696, + "step": 12520 + }, + { + "epoch": 7.38502358490566, + "grad_norm": 3.119663953781128, + "learning_rate": 7.949905032574534e-06, + "loss": 0.4056, + "num_input_tokens_seen": 8166272, + "step": 12525 + }, + { + "epoch": 7.3879716981132075, + "grad_norm": 4.333902835845947, + "learning_rate": 7.947827388739145e-06, + "loss": 0.5553, + "num_input_tokens_seen": 8169504, + "step": 12530 + }, + { + "epoch": 7.390919811320755, + "grad_norm": 3.6118650436401367, + "learning_rate": 7.945748964453747e-06, + "loss": 0.468, + "num_input_tokens_seen": 8172800, + "step": 12535 + }, + { + "epoch": 7.393867924528302, + "grad_norm": 1.6763180494308472, + "learning_rate": 7.943669760268618e-06, + "loss": 0.511, + "num_input_tokens_seen": 8176480, + "step": 12540 + }, + { + "epoch": 7.396816037735849, + "grad_norm": 2.1456193923950195, + "learning_rate": 7.941589776734232e-06, + "loss": 0.4246, + "num_input_tokens_seen": 8179616, + "step": 12545 + }, + { + "epoch": 7.399764150943396, + "grad_norm": 2.367192029953003, + "learning_rate": 7.939509014401277e-06, + "loss": 0.3964, + "num_input_tokens_seen": 8183488, + "step": 12550 + }, + { + "epoch": 7.402712264150943, + "grad_norm": 3.201099395751953, + "learning_rate": 7.93742747382064e-06, + "loss": 0.4537, + "num_input_tokens_seen": 8186592, + "step": 12555 + }, + { + "epoch": 7.40566037735849, + "grad_norm": 1.5436292886734009, + "learning_rate": 7.93534515554342e-06, + "loss": 0.3649, + "num_input_tokens_seen": 8190048, + "step": 12560 + }, + { + "epoch": 7.408608490566038, + "grad_norm": 2.39947509765625, + "learning_rate": 7.933262060120918e-06, + "loss": 0.4374, + "num_input_tokens_seen": 8193440, + "step": 12565 + }, + { + "epoch": 7.411556603773585, + "grad_norm": 1.8190984725952148, + "learning_rate": 7.931178188104646e-06, + "loss": 0.4411, + "num_input_tokens_seen": 8196736, + "step": 12570 + }, + { + "epoch": 7.414504716981132, + "grad_norm": 1.9123783111572266, + "learning_rate": 7.929093540046317e-06, + "loss": 0.5102, + "num_input_tokens_seen": 8199456, + "step": 12575 + }, + { + "epoch": 7.4174528301886795, + "grad_norm": 2.235212802886963, + "learning_rate": 7.927008116497848e-06, + "loss": 0.6973, + "num_input_tokens_seen": 8203360, + "step": 12580 + }, + { + "epoch": 7.420400943396227, + "grad_norm": 1.9296789169311523, + "learning_rate": 7.924921918011366e-06, + "loss": 0.4744, + "num_input_tokens_seen": 8207264, + "step": 12585 + }, + { + "epoch": 7.423349056603773, + "grad_norm": 2.060218095779419, + "learning_rate": 7.9228349451392e-06, + "loss": 0.3945, + "num_input_tokens_seen": 8210528, + "step": 12590 + }, + { + "epoch": 7.4262971698113205, + "grad_norm": 1.3163892030715942, + "learning_rate": 7.920747198433884e-06, + "loss": 0.3177, + "num_input_tokens_seen": 8213728, + "step": 12595 + }, + { + "epoch": 7.429245283018868, + "grad_norm": 1.5851162672042847, + "learning_rate": 7.91865867844816e-06, + "loss": 0.4901, + "num_input_tokens_seen": 8217856, + "step": 12600 + }, + { + "epoch": 7.432193396226415, + "grad_norm": 1.8003064393997192, + "learning_rate": 7.916569385734976e-06, + "loss": 0.3317, + "num_input_tokens_seen": 8220000, + "step": 12605 + }, + { + "epoch": 7.435141509433962, + "grad_norm": 1.8168344497680664, + "learning_rate": 7.914479320847474e-06, + "loss": 0.3992, + "num_input_tokens_seen": 8223040, + "step": 12610 + }, + { + "epoch": 7.43808962264151, + "grad_norm": 2.325937509536743, + "learning_rate": 7.912388484339012e-06, + "loss": 0.3922, + "num_input_tokens_seen": 8225344, + "step": 12615 + }, + { + "epoch": 7.441037735849057, + "grad_norm": 3.7619271278381348, + "learning_rate": 7.910296876763147e-06, + "loss": 0.4717, + "num_input_tokens_seen": 8227840, + "step": 12620 + }, + { + "epoch": 7.443985849056604, + "grad_norm": 1.8331016302108765, + "learning_rate": 7.90820449867364e-06, + "loss": 0.4376, + "num_input_tokens_seen": 8231232, + "step": 12625 + }, + { + "epoch": 7.446933962264151, + "grad_norm": 2.1001436710357666, + "learning_rate": 7.90611135062446e-06, + "loss": 0.414, + "num_input_tokens_seen": 8234400, + "step": 12630 + }, + { + "epoch": 7.449882075471698, + "grad_norm": 2.465538740158081, + "learning_rate": 7.904017433169775e-06, + "loss": 0.4386, + "num_input_tokens_seen": 8237312, + "step": 12635 + }, + { + "epoch": 7.452830188679245, + "grad_norm": 5.104916095733643, + "learning_rate": 7.901922746863957e-06, + "loss": 0.4274, + "num_input_tokens_seen": 8240864, + "step": 12640 + }, + { + "epoch": 7.4557783018867925, + "grad_norm": 1.9842617511749268, + "learning_rate": 7.899827292261589e-06, + "loss": 0.4209, + "num_input_tokens_seen": 8244256, + "step": 12645 + }, + { + "epoch": 7.45872641509434, + "grad_norm": 2.355825424194336, + "learning_rate": 7.897731069917444e-06, + "loss": 0.3408, + "num_input_tokens_seen": 8248064, + "step": 12650 + }, + { + "epoch": 7.461674528301887, + "grad_norm": 3.1829774379730225, + "learning_rate": 7.895634080386512e-06, + "loss": 0.4101, + "num_input_tokens_seen": 8251648, + "step": 12655 + }, + { + "epoch": 7.464622641509434, + "grad_norm": 2.1236677169799805, + "learning_rate": 7.893536324223977e-06, + "loss": 0.4067, + "num_input_tokens_seen": 8255456, + "step": 12660 + }, + { + "epoch": 7.467570754716981, + "grad_norm": 2.7250266075134277, + "learning_rate": 7.89143780198523e-06, + "loss": 0.4408, + "num_input_tokens_seen": 8258208, + "step": 12665 + }, + { + "epoch": 7.470518867924528, + "grad_norm": 3.2099993228912354, + "learning_rate": 7.889338514225862e-06, + "loss": 0.3645, + "num_input_tokens_seen": 8260960, + "step": 12670 + }, + { + "epoch": 7.473466981132075, + "grad_norm": 4.372264862060547, + "learning_rate": 7.887238461501671e-06, + "loss": 0.4477, + "num_input_tokens_seen": 8263360, + "step": 12675 + }, + { + "epoch": 7.476415094339623, + "grad_norm": 1.862304449081421, + "learning_rate": 7.885137644368654e-06, + "loss": 0.294, + "num_input_tokens_seen": 8266016, + "step": 12680 + }, + { + "epoch": 7.47936320754717, + "grad_norm": 2.1847269535064697, + "learning_rate": 7.883036063383012e-06, + "loss": 0.4599, + "num_input_tokens_seen": 8268640, + "step": 12685 + }, + { + "epoch": 7.482311320754717, + "grad_norm": 3.9076075553894043, + "learning_rate": 7.880933719101148e-06, + "loss": 0.3729, + "num_input_tokens_seen": 8271456, + "step": 12690 + }, + { + "epoch": 7.4852594339622645, + "grad_norm": 2.3612866401672363, + "learning_rate": 7.878830612079664e-06, + "loss": 0.381, + "num_input_tokens_seen": 8274656, + "step": 12695 + }, + { + "epoch": 7.488207547169811, + "grad_norm": 5.0508131980896, + "learning_rate": 7.876726742875369e-06, + "loss": 0.4714, + "num_input_tokens_seen": 8277472, + "step": 12700 + }, + { + "epoch": 7.491155660377358, + "grad_norm": 3.3084397315979004, + "learning_rate": 7.874622112045269e-06, + "loss": 0.4641, + "num_input_tokens_seen": 8280800, + "step": 12705 + }, + { + "epoch": 7.494103773584905, + "grad_norm": 2.756094455718994, + "learning_rate": 7.872516720146578e-06, + "loss": 0.3916, + "num_input_tokens_seen": 8283712, + "step": 12710 + }, + { + "epoch": 7.497051886792453, + "grad_norm": 1.5952520370483398, + "learning_rate": 7.870410567736705e-06, + "loss": 0.37, + "num_input_tokens_seen": 8287872, + "step": 12715 + }, + { + "epoch": 7.5, + "grad_norm": 1.5211774110794067, + "learning_rate": 7.868303655373264e-06, + "loss": 0.3059, + "num_input_tokens_seen": 8291328, + "step": 12720 + }, + { + "epoch": 7.502948113207547, + "grad_norm": 2.261403799057007, + "learning_rate": 7.866195983614066e-06, + "loss": 0.5107, + "num_input_tokens_seen": 8293920, + "step": 12725 + }, + { + "epoch": 7.505896226415095, + "grad_norm": 4.688958168029785, + "learning_rate": 7.864087553017133e-06, + "loss": 0.5651, + "num_input_tokens_seen": 8296704, + "step": 12730 + }, + { + "epoch": 7.508844339622642, + "grad_norm": 3.901465892791748, + "learning_rate": 7.861978364140674e-06, + "loss": 0.4373, + "num_input_tokens_seen": 8301792, + "step": 12735 + }, + { + "epoch": 7.511792452830189, + "grad_norm": 2.922650098800659, + "learning_rate": 7.859868417543109e-06, + "loss": 0.4092, + "num_input_tokens_seen": 8305312, + "step": 12740 + }, + { + "epoch": 7.5147405660377355, + "grad_norm": 1.942541241645813, + "learning_rate": 7.857757713783055e-06, + "loss": 0.3922, + "num_input_tokens_seen": 8308832, + "step": 12745 + }, + { + "epoch": 7.517688679245283, + "grad_norm": 2.7667229175567627, + "learning_rate": 7.855646253419331e-06, + "loss": 0.3832, + "num_input_tokens_seen": 8311200, + "step": 12750 + }, + { + "epoch": 7.52063679245283, + "grad_norm": 1.7393347024917603, + "learning_rate": 7.853534037010952e-06, + "loss": 0.4667, + "num_input_tokens_seen": 8313952, + "step": 12755 + }, + { + "epoch": 7.523584905660377, + "grad_norm": 2.5969932079315186, + "learning_rate": 7.851421065117142e-06, + "loss": 0.4567, + "num_input_tokens_seen": 8317024, + "step": 12760 + }, + { + "epoch": 7.526533018867925, + "grad_norm": 3.351426601409912, + "learning_rate": 7.849307338297314e-06, + "loss": 0.382, + "num_input_tokens_seen": 8319648, + "step": 12765 + }, + { + "epoch": 7.529481132075472, + "grad_norm": 1.6915512084960938, + "learning_rate": 7.847192857111087e-06, + "loss": 0.3355, + "num_input_tokens_seen": 8322848, + "step": 12770 + }, + { + "epoch": 7.532429245283019, + "grad_norm": 2.4099693298339844, + "learning_rate": 7.845077622118282e-06, + "loss": 0.5652, + "num_input_tokens_seen": 8325664, + "step": 12775 + }, + { + "epoch": 7.535377358490566, + "grad_norm": 1.341578483581543, + "learning_rate": 7.842961633878916e-06, + "loss": 0.544, + "num_input_tokens_seen": 8330144, + "step": 12780 + }, + { + "epoch": 7.538325471698113, + "grad_norm": 2.356422185897827, + "learning_rate": 7.840844892953204e-06, + "loss": 0.4593, + "num_input_tokens_seen": 8333344, + "step": 12785 + }, + { + "epoch": 7.54127358490566, + "grad_norm": 3.681206703186035, + "learning_rate": 7.838727399901562e-06, + "loss": 0.4677, + "num_input_tokens_seen": 8337920, + "step": 12790 + }, + { + "epoch": 7.5442216981132075, + "grad_norm": 1.7842581272125244, + "learning_rate": 7.836609155284607e-06, + "loss": 0.399, + "num_input_tokens_seen": 8340704, + "step": 12795 + }, + { + "epoch": 7.547169811320755, + "grad_norm": 3.0184223651885986, + "learning_rate": 7.834490159663154e-06, + "loss": 0.3755, + "num_input_tokens_seen": 8343872, + "step": 12800 + }, + { + "epoch": 7.550117924528302, + "grad_norm": 3.3758602142333984, + "learning_rate": 7.832370413598215e-06, + "loss": 0.3324, + "num_input_tokens_seen": 8347456, + "step": 12805 + }, + { + "epoch": 7.553066037735849, + "grad_norm": 1.961235761642456, + "learning_rate": 7.830249917651003e-06, + "loss": 0.5419, + "num_input_tokens_seen": 8350688, + "step": 12810 + }, + { + "epoch": 7.556014150943396, + "grad_norm": 2.4225265979766846, + "learning_rate": 7.828128672382926e-06, + "loss": 0.4354, + "num_input_tokens_seen": 8353920, + "step": 12815 + }, + { + "epoch": 7.558962264150943, + "grad_norm": 1.5328577756881714, + "learning_rate": 7.826006678355596e-06, + "loss": 0.3481, + "num_input_tokens_seen": 8357024, + "step": 12820 + }, + { + "epoch": 7.56191037735849, + "grad_norm": 2.3475513458251953, + "learning_rate": 7.823883936130817e-06, + "loss": 0.3697, + "num_input_tokens_seen": 8360160, + "step": 12825 + }, + { + "epoch": 7.564858490566038, + "grad_norm": 2.133432626724243, + "learning_rate": 7.821760446270597e-06, + "loss": 0.3563, + "num_input_tokens_seen": 8362976, + "step": 12830 + }, + { + "epoch": 7.567806603773585, + "grad_norm": 1.855751395225525, + "learning_rate": 7.819636209337136e-06, + "loss": 0.3888, + "num_input_tokens_seen": 8365888, + "step": 12835 + }, + { + "epoch": 7.570754716981132, + "grad_norm": 1.731449842453003, + "learning_rate": 7.817511225892838e-06, + "loss": 0.4236, + "num_input_tokens_seen": 8369120, + "step": 12840 + }, + { + "epoch": 7.5737028301886795, + "grad_norm": 2.062988519668579, + "learning_rate": 7.8153854965003e-06, + "loss": 0.4603, + "num_input_tokens_seen": 8373376, + "step": 12845 + }, + { + "epoch": 7.576650943396227, + "grad_norm": 2.6432371139526367, + "learning_rate": 7.813259021722319e-06, + "loss": 0.3501, + "num_input_tokens_seen": 8375808, + "step": 12850 + }, + { + "epoch": 7.579599056603773, + "grad_norm": 2.4975013732910156, + "learning_rate": 7.811131802121885e-06, + "loss": 0.5657, + "num_input_tokens_seen": 8378720, + "step": 12855 + }, + { + "epoch": 7.5825471698113205, + "grad_norm": 2.2664835453033447, + "learning_rate": 7.809003838262193e-06, + "loss": 0.3197, + "num_input_tokens_seen": 8381728, + "step": 12860 + }, + { + "epoch": 7.585495283018868, + "grad_norm": 2.14251446723938, + "learning_rate": 7.806875130706628e-06, + "loss": 0.4422, + "num_input_tokens_seen": 8384480, + "step": 12865 + }, + { + "epoch": 7.588443396226415, + "grad_norm": 3.790902614593506, + "learning_rate": 7.804745680018775e-06, + "loss": 0.3221, + "num_input_tokens_seen": 8387104, + "step": 12870 + }, + { + "epoch": 7.591391509433962, + "grad_norm": 1.4395138025283813, + "learning_rate": 7.802615486762418e-06, + "loss": 0.3857, + "num_input_tokens_seen": 8390560, + "step": 12875 + }, + { + "epoch": 7.59433962264151, + "grad_norm": 2.2490885257720947, + "learning_rate": 7.800484551501528e-06, + "loss": 0.4363, + "num_input_tokens_seen": 8393472, + "step": 12880 + }, + { + "epoch": 7.597287735849057, + "grad_norm": 2.8760266304016113, + "learning_rate": 7.798352874800285e-06, + "loss": 0.4374, + "num_input_tokens_seen": 8396416, + "step": 12885 + }, + { + "epoch": 7.600235849056604, + "grad_norm": 1.5365777015686035, + "learning_rate": 7.79622045722306e-06, + "loss": 0.3626, + "num_input_tokens_seen": 8400096, + "step": 12890 + }, + { + "epoch": 7.603183962264151, + "grad_norm": 4.174517631530762, + "learning_rate": 7.794087299334416e-06, + "loss": 0.3912, + "num_input_tokens_seen": 8403552, + "step": 12895 + }, + { + "epoch": 7.606132075471698, + "grad_norm": 1.8114674091339111, + "learning_rate": 7.79195340169912e-06, + "loss": 0.4015, + "num_input_tokens_seen": 8407648, + "step": 12900 + }, + { + "epoch": 7.609080188679245, + "grad_norm": 1.6811645030975342, + "learning_rate": 7.789818764882127e-06, + "loss": 0.471, + "num_input_tokens_seen": 8410720, + "step": 12905 + }, + { + "epoch": 7.6120283018867925, + "grad_norm": 2.1953039169311523, + "learning_rate": 7.78768338944859e-06, + "loss": 0.31, + "num_input_tokens_seen": 8413408, + "step": 12910 + }, + { + "epoch": 7.61497641509434, + "grad_norm": 1.9849573373794556, + "learning_rate": 7.785547275963865e-06, + "loss": 0.3788, + "num_input_tokens_seen": 8416768, + "step": 12915 + }, + { + "epoch": 7.617924528301887, + "grad_norm": 2.6337649822235107, + "learning_rate": 7.783410424993492e-06, + "loss": 0.3913, + "num_input_tokens_seen": 8419072, + "step": 12920 + }, + { + "epoch": 7.620872641509434, + "grad_norm": 2.12776255607605, + "learning_rate": 7.781272837103213e-06, + "loss": 0.4123, + "num_input_tokens_seen": 8421728, + "step": 12925 + }, + { + "epoch": 7.623820754716981, + "grad_norm": 2.0554819107055664, + "learning_rate": 7.779134512858964e-06, + "loss": 0.3753, + "num_input_tokens_seen": 8425184, + "step": 12930 + }, + { + "epoch": 7.626768867924528, + "grad_norm": 8.579044342041016, + "learning_rate": 7.776995452826876e-06, + "loss": 0.4383, + "num_input_tokens_seen": 8428256, + "step": 12935 + }, + { + "epoch": 7.629716981132075, + "grad_norm": 2.6147043704986572, + "learning_rate": 7.774855657573274e-06, + "loss": 0.4513, + "num_input_tokens_seen": 8431872, + "step": 12940 + }, + { + "epoch": 7.632665094339623, + "grad_norm": 1.7213010787963867, + "learning_rate": 7.772715127664676e-06, + "loss": 0.3603, + "num_input_tokens_seen": 8434656, + "step": 12945 + }, + { + "epoch": 7.63561320754717, + "grad_norm": 3.4233412742614746, + "learning_rate": 7.7705738636678e-06, + "loss": 0.3734, + "num_input_tokens_seen": 8437504, + "step": 12950 + }, + { + "epoch": 7.638561320754717, + "grad_norm": 1.9577927589416504, + "learning_rate": 7.768431866149552e-06, + "loss": 0.3585, + "num_input_tokens_seen": 8440960, + "step": 12955 + }, + { + "epoch": 7.6415094339622645, + "grad_norm": 2.636037588119507, + "learning_rate": 7.766289135677035e-06, + "loss": 0.4011, + "num_input_tokens_seen": 8444128, + "step": 12960 + }, + { + "epoch": 7.644457547169811, + "grad_norm": 1.8809701204299927, + "learning_rate": 7.764145672817549e-06, + "loss": 0.498, + "num_input_tokens_seen": 8448192, + "step": 12965 + }, + { + "epoch": 7.647405660377358, + "grad_norm": 2.8841614723205566, + "learning_rate": 7.762001478138583e-06, + "loss": 0.4189, + "num_input_tokens_seen": 8451744, + "step": 12970 + }, + { + "epoch": 7.650353773584905, + "grad_norm": 3.031399965286255, + "learning_rate": 7.759856552207822e-06, + "loss": 0.4304, + "num_input_tokens_seen": 8455040, + "step": 12975 + }, + { + "epoch": 7.653301886792453, + "grad_norm": 4.450901508331299, + "learning_rate": 7.757710895593144e-06, + "loss": 0.3564, + "num_input_tokens_seen": 8457792, + "step": 12980 + }, + { + "epoch": 7.65625, + "grad_norm": 2.7953264713287354, + "learning_rate": 7.755564508862623e-06, + "loss": 0.4801, + "num_input_tokens_seen": 8461440, + "step": 12985 + }, + { + "epoch": 7.659198113207547, + "grad_norm": 2.5081403255462646, + "learning_rate": 7.753417392584522e-06, + "loss": 0.527, + "num_input_tokens_seen": 8464576, + "step": 12990 + }, + { + "epoch": 7.662146226415095, + "grad_norm": 2.059462070465088, + "learning_rate": 7.751269547327298e-06, + "loss": 0.301, + "num_input_tokens_seen": 8468160, + "step": 12995 + }, + { + "epoch": 7.665094339622642, + "grad_norm": 2.0491855144500732, + "learning_rate": 7.749120973659606e-06, + "loss": 0.3607, + "num_input_tokens_seen": 8471296, + "step": 13000 + }, + { + "epoch": 7.668042452830189, + "grad_norm": 2.485041379928589, + "learning_rate": 7.746971672150286e-06, + "loss": 0.3674, + "num_input_tokens_seen": 8473728, + "step": 13005 + }, + { + "epoch": 7.6709905660377355, + "grad_norm": 3.1727583408355713, + "learning_rate": 7.74482164336838e-06, + "loss": 0.5391, + "num_input_tokens_seen": 8477600, + "step": 13010 + }, + { + "epoch": 7.673938679245283, + "grad_norm": 1.7440383434295654, + "learning_rate": 7.742670887883111e-06, + "loss": 0.5365, + "num_input_tokens_seen": 8480704, + "step": 13015 + }, + { + "epoch": 7.67688679245283, + "grad_norm": 2.0993595123291016, + "learning_rate": 7.740519406263905e-06, + "loss": 0.3469, + "num_input_tokens_seen": 8483872, + "step": 13020 + }, + { + "epoch": 7.679834905660377, + "grad_norm": 3.6898202896118164, + "learning_rate": 7.738367199080376e-06, + "loss": 0.2866, + "num_input_tokens_seen": 8486848, + "step": 13025 + }, + { + "epoch": 7.682783018867925, + "grad_norm": 2.1749861240386963, + "learning_rate": 7.73621426690233e-06, + "loss": 0.57, + "num_input_tokens_seen": 8489536, + "step": 13030 + }, + { + "epoch": 7.685731132075472, + "grad_norm": 8.308359146118164, + "learning_rate": 7.734060610299764e-06, + "loss": 0.4558, + "num_input_tokens_seen": 8492320, + "step": 13035 + }, + { + "epoch": 7.688679245283019, + "grad_norm": 2.591728687286377, + "learning_rate": 7.731906229842869e-06, + "loss": 0.5196, + "num_input_tokens_seen": 8495328, + "step": 13040 + }, + { + "epoch": 7.691627358490566, + "grad_norm": 2.6472785472869873, + "learning_rate": 7.729751126102023e-06, + "loss": 0.4421, + "num_input_tokens_seen": 8498368, + "step": 13045 + }, + { + "epoch": 7.694575471698113, + "grad_norm": 1.0644233226776123, + "learning_rate": 7.727595299647805e-06, + "loss": 0.3836, + "num_input_tokens_seen": 8501632, + "step": 13050 + }, + { + "epoch": 7.69752358490566, + "grad_norm": 1.721089243888855, + "learning_rate": 7.725438751050973e-06, + "loss": 0.3607, + "num_input_tokens_seen": 8504800, + "step": 13055 + }, + { + "epoch": 7.7004716981132075, + "grad_norm": 2.9734160900115967, + "learning_rate": 7.723281480882489e-06, + "loss": 0.5088, + "num_input_tokens_seen": 8507264, + "step": 13060 + }, + { + "epoch": 7.703419811320755, + "grad_norm": 4.0242600440979, + "learning_rate": 7.721123489713494e-06, + "loss": 0.3502, + "num_input_tokens_seen": 8511040, + "step": 13065 + }, + { + "epoch": 7.706367924528302, + "grad_norm": 3.9850783348083496, + "learning_rate": 7.718964778115328e-06, + "loss": 0.394, + "num_input_tokens_seen": 8513952, + "step": 13070 + }, + { + "epoch": 7.709316037735849, + "grad_norm": 1.89198899269104, + "learning_rate": 7.716805346659519e-06, + "loss": 0.4228, + "num_input_tokens_seen": 8517344, + "step": 13075 + }, + { + "epoch": 7.712264150943396, + "grad_norm": 3.7127890586853027, + "learning_rate": 7.714645195917788e-06, + "loss": 0.4467, + "num_input_tokens_seen": 8520448, + "step": 13080 + }, + { + "epoch": 7.715212264150943, + "grad_norm": 2.149409532546997, + "learning_rate": 7.712484326462038e-06, + "loss": 0.4963, + "num_input_tokens_seen": 8523328, + "step": 13085 + }, + { + "epoch": 7.71816037735849, + "grad_norm": 1.1930314302444458, + "learning_rate": 7.710322738864375e-06, + "loss": 0.3976, + "num_input_tokens_seen": 8526816, + "step": 13090 + }, + { + "epoch": 7.721108490566038, + "grad_norm": 3.9999423027038574, + "learning_rate": 7.708160433697085e-06, + "loss": 0.4499, + "num_input_tokens_seen": 8529504, + "step": 13095 + }, + { + "epoch": 7.724056603773585, + "grad_norm": 1.6784738302230835, + "learning_rate": 7.705997411532649e-06, + "loss": 0.3348, + "num_input_tokens_seen": 8533056, + "step": 13100 + }, + { + "epoch": 7.727004716981132, + "grad_norm": 2.267273426055908, + "learning_rate": 7.703833672943735e-06, + "loss": 0.4559, + "num_input_tokens_seen": 8536288, + "step": 13105 + }, + { + "epoch": 7.7299528301886795, + "grad_norm": 2.2425405979156494, + "learning_rate": 7.701669218503206e-06, + "loss": 0.4687, + "num_input_tokens_seen": 8538880, + "step": 13110 + }, + { + "epoch": 7.732900943396227, + "grad_norm": 3.9774842262268066, + "learning_rate": 7.699504048784106e-06, + "loss": 0.402, + "num_input_tokens_seen": 8544736, + "step": 13115 + }, + { + "epoch": 7.735849056603773, + "grad_norm": 2.8074254989624023, + "learning_rate": 7.697338164359675e-06, + "loss": 0.4157, + "num_input_tokens_seen": 8547136, + "step": 13120 + }, + { + "epoch": 7.7387971698113205, + "grad_norm": 2.6301400661468506, + "learning_rate": 7.69517156580334e-06, + "loss": 0.3464, + "num_input_tokens_seen": 8549984, + "step": 13125 + }, + { + "epoch": 7.741745283018868, + "grad_norm": 2.555373191833496, + "learning_rate": 7.693004253688716e-06, + "loss": 0.4093, + "num_input_tokens_seen": 8553792, + "step": 13130 + }, + { + "epoch": 7.744693396226415, + "grad_norm": 1.6086993217468262, + "learning_rate": 7.690836228589613e-06, + "loss": 0.3818, + "num_input_tokens_seen": 8556480, + "step": 13135 + }, + { + "epoch": 7.747641509433962, + "grad_norm": 3.822707176208496, + "learning_rate": 7.688667491080019e-06, + "loss": 0.4145, + "num_input_tokens_seen": 8559616, + "step": 13140 + }, + { + "epoch": 7.75058962264151, + "grad_norm": 2.938126802444458, + "learning_rate": 7.686498041734121e-06, + "loss": 0.4131, + "num_input_tokens_seen": 8562400, + "step": 13145 + }, + { + "epoch": 7.753537735849057, + "grad_norm": 2.0569612979888916, + "learning_rate": 7.684327881126285e-06, + "loss": 0.4227, + "num_input_tokens_seen": 8566336, + "step": 13150 + }, + { + "epoch": 7.756485849056604, + "grad_norm": 1.5294172763824463, + "learning_rate": 7.682157009831078e-06, + "loss": 0.4486, + "num_input_tokens_seen": 8570976, + "step": 13155 + }, + { + "epoch": 7.759433962264151, + "grad_norm": 2.7983970642089844, + "learning_rate": 7.67998542842324e-06, + "loss": 0.4101, + "num_input_tokens_seen": 8574176, + "step": 13160 + }, + { + "epoch": 7.762382075471698, + "grad_norm": 1.6997811794281006, + "learning_rate": 7.677813137477711e-06, + "loss": 0.4454, + "num_input_tokens_seen": 8577120, + "step": 13165 + }, + { + "epoch": 7.765330188679245, + "grad_norm": 2.4485061168670654, + "learning_rate": 7.675640137569614e-06, + "loss": 0.441, + "num_input_tokens_seen": 8579936, + "step": 13170 + }, + { + "epoch": 7.7682783018867925, + "grad_norm": 2.12693190574646, + "learning_rate": 7.673466429274257e-06, + "loss": 0.5256, + "num_input_tokens_seen": 8582304, + "step": 13175 + }, + { + "epoch": 7.77122641509434, + "grad_norm": 1.5442661046981812, + "learning_rate": 7.671292013167143e-06, + "loss": 0.3135, + "num_input_tokens_seen": 8585184, + "step": 13180 + }, + { + "epoch": 7.774174528301887, + "grad_norm": 1.6561390161514282, + "learning_rate": 7.669116889823955e-06, + "loss": 0.4471, + "num_input_tokens_seen": 8588128, + "step": 13185 + }, + { + "epoch": 7.777122641509434, + "grad_norm": 1.3347402811050415, + "learning_rate": 7.666941059820567e-06, + "loss": 0.3611, + "num_input_tokens_seen": 8591808, + "step": 13190 + }, + { + "epoch": 7.780070754716981, + "grad_norm": 3.6079633235931396, + "learning_rate": 7.66476452373304e-06, + "loss": 0.4664, + "num_input_tokens_seen": 8594752, + "step": 13195 + }, + { + "epoch": 7.783018867924528, + "grad_norm": 1.9773160219192505, + "learning_rate": 7.66258728213762e-06, + "loss": 0.3859, + "num_input_tokens_seen": 8598304, + "step": 13200 + }, + { + "epoch": 7.785966981132075, + "grad_norm": 3.327244520187378, + "learning_rate": 7.66040933561074e-06, + "loss": 0.3913, + "num_input_tokens_seen": 8600992, + "step": 13205 + }, + { + "epoch": 7.788915094339623, + "grad_norm": 2.380021810531616, + "learning_rate": 7.658230684729027e-06, + "loss": 0.4422, + "num_input_tokens_seen": 8608192, + "step": 13210 + }, + { + "epoch": 7.79186320754717, + "grad_norm": 1.989427924156189, + "learning_rate": 7.656051330069282e-06, + "loss": 0.3475, + "num_input_tokens_seen": 8612576, + "step": 13215 + }, + { + "epoch": 7.794811320754717, + "grad_norm": 3.6624433994293213, + "learning_rate": 7.6538712722085e-06, + "loss": 0.4225, + "num_input_tokens_seen": 8615680, + "step": 13220 + }, + { + "epoch": 7.7977594339622645, + "grad_norm": 2.0149941444396973, + "learning_rate": 7.651690511723862e-06, + "loss": 0.3988, + "num_input_tokens_seen": 8618080, + "step": 13225 + }, + { + "epoch": 7.800707547169811, + "grad_norm": 1.566019892692566, + "learning_rate": 7.64950904919273e-06, + "loss": 0.2902, + "num_input_tokens_seen": 8620608, + "step": 13230 + }, + { + "epoch": 7.803655660377358, + "grad_norm": 2.344214677810669, + "learning_rate": 7.647326885192662e-06, + "loss": 0.5519, + "num_input_tokens_seen": 8623424, + "step": 13235 + }, + { + "epoch": 7.806603773584905, + "grad_norm": 2.9266605377197266, + "learning_rate": 7.645144020301392e-06, + "loss": 0.5043, + "num_input_tokens_seen": 8627360, + "step": 13240 + }, + { + "epoch": 7.809551886792453, + "grad_norm": 2.5658066272735596, + "learning_rate": 7.64296045509684e-06, + "loss": 0.4525, + "num_input_tokens_seen": 8629888, + "step": 13245 + }, + { + "epoch": 7.8125, + "grad_norm": 2.396303176879883, + "learning_rate": 7.64077619015712e-06, + "loss": 0.5715, + "num_input_tokens_seen": 8633408, + "step": 13250 + }, + { + "epoch": 7.815448113207547, + "grad_norm": 2.0013973712921143, + "learning_rate": 7.638591226060519e-06, + "loss": 0.4835, + "num_input_tokens_seen": 8636512, + "step": 13255 + }, + { + "epoch": 7.818396226415095, + "grad_norm": 3.196708917617798, + "learning_rate": 7.636405563385522e-06, + "loss": 0.4835, + "num_input_tokens_seen": 8640320, + "step": 13260 + }, + { + "epoch": 7.821344339622642, + "grad_norm": 1.9235656261444092, + "learning_rate": 7.634219202710789e-06, + "loss": 0.5227, + "num_input_tokens_seen": 8643872, + "step": 13265 + }, + { + "epoch": 7.824292452830189, + "grad_norm": 1.7198208570480347, + "learning_rate": 7.632032144615168e-06, + "loss": 0.2569, + "num_input_tokens_seen": 8646880, + "step": 13270 + }, + { + "epoch": 7.8272405660377355, + "grad_norm": 2.324612617492676, + "learning_rate": 7.629844389677695e-06, + "loss": 0.4049, + "num_input_tokens_seen": 8649568, + "step": 13275 + }, + { + "epoch": 7.830188679245283, + "grad_norm": 2.9373276233673096, + "learning_rate": 7.627655938477586e-06, + "loss": 0.3216, + "num_input_tokens_seen": 8652672, + "step": 13280 + }, + { + "epoch": 7.83313679245283, + "grad_norm": 1.9209688901901245, + "learning_rate": 7.6254667915942415e-06, + "loss": 0.3919, + "num_input_tokens_seen": 8656192, + "step": 13285 + }, + { + "epoch": 7.836084905660377, + "grad_norm": 1.7439285516738892, + "learning_rate": 7.62327694960725e-06, + "loss": 0.455, + "num_input_tokens_seen": 8659776, + "step": 13290 + }, + { + "epoch": 7.839033018867925, + "grad_norm": 3.3810596466064453, + "learning_rate": 7.621086413096379e-06, + "loss": 0.4131, + "num_input_tokens_seen": 8662688, + "step": 13295 + }, + { + "epoch": 7.841981132075472, + "grad_norm": 4.054974555969238, + "learning_rate": 7.618895182641584e-06, + "loss": 0.3886, + "num_input_tokens_seen": 8665760, + "step": 13300 + }, + { + "epoch": 7.844929245283019, + "grad_norm": 2.0524401664733887, + "learning_rate": 7.6167032588230035e-06, + "loss": 0.5656, + "num_input_tokens_seen": 8669280, + "step": 13305 + }, + { + "epoch": 7.847877358490566, + "grad_norm": 2.329983711242676, + "learning_rate": 7.614510642220958e-06, + "loss": 0.3872, + "num_input_tokens_seen": 8673184, + "step": 13310 + }, + { + "epoch": 7.850825471698113, + "grad_norm": 1.6023428440093994, + "learning_rate": 7.612317333415951e-06, + "loss": 0.3348, + "num_input_tokens_seen": 8676640, + "step": 13315 + }, + { + "epoch": 7.85377358490566, + "grad_norm": 3.4624242782592773, + "learning_rate": 7.610123332988673e-06, + "loss": 0.5106, + "num_input_tokens_seen": 8679744, + "step": 13320 + }, + { + "epoch": 7.8567216981132075, + "grad_norm": 1.1404262781143188, + "learning_rate": 7.607928641519992e-06, + "loss": 0.3815, + "num_input_tokens_seen": 8682752, + "step": 13325 + }, + { + "epoch": 7.859669811320755, + "grad_norm": 2.5558249950408936, + "learning_rate": 7.605733259590964e-06, + "loss": 0.4225, + "num_input_tokens_seen": 8685472, + "step": 13330 + }, + { + "epoch": 7.862617924528302, + "grad_norm": 1.731750249862671, + "learning_rate": 7.603537187782826e-06, + "loss": 0.3393, + "num_input_tokens_seen": 8689632, + "step": 13335 + }, + { + "epoch": 7.865566037735849, + "grad_norm": 2.082778215408325, + "learning_rate": 7.601340426676996e-06, + "loss": 0.4447, + "num_input_tokens_seen": 8693440, + "step": 13340 + }, + { + "epoch": 7.868514150943396, + "grad_norm": 2.76826810836792, + "learning_rate": 7.599142976855077e-06, + "loss": 0.3355, + "num_input_tokens_seen": 8696960, + "step": 13345 + }, + { + "epoch": 7.871462264150943, + "grad_norm": 7.586110591888428, + "learning_rate": 7.596944838898854e-06, + "loss": 0.3982, + "num_input_tokens_seen": 8700864, + "step": 13350 + }, + { + "epoch": 7.87441037735849, + "grad_norm": 3.6489715576171875, + "learning_rate": 7.594746013390293e-06, + "loss": 0.432, + "num_input_tokens_seen": 8703904, + "step": 13355 + }, + { + "epoch": 7.877358490566038, + "grad_norm": 1.4688109159469604, + "learning_rate": 7.59254650091154e-06, + "loss": 0.4019, + "num_input_tokens_seen": 8707552, + "step": 13360 + }, + { + "epoch": 7.880306603773585, + "grad_norm": 2.336695909500122, + "learning_rate": 7.59034630204493e-06, + "loss": 0.3984, + "num_input_tokens_seen": 8710464, + "step": 13365 + }, + { + "epoch": 7.883254716981132, + "grad_norm": 2.1053175926208496, + "learning_rate": 7.588145417372972e-06, + "loss": 0.3975, + "num_input_tokens_seen": 8713216, + "step": 13370 + }, + { + "epoch": 7.8862028301886795, + "grad_norm": 4.455665111541748, + "learning_rate": 7.585943847478361e-06, + "loss": 0.4141, + "num_input_tokens_seen": 8716512, + "step": 13375 + }, + { + "epoch": 7.889150943396227, + "grad_norm": 2.159214735031128, + "learning_rate": 7.583741592943971e-06, + "loss": 0.3418, + "num_input_tokens_seen": 8719840, + "step": 13380 + }, + { + "epoch": 7.892099056603773, + "grad_norm": 1.2077945470809937, + "learning_rate": 7.581538654352859e-06, + "loss": 0.3369, + "num_input_tokens_seen": 8723296, + "step": 13385 + }, + { + "epoch": 7.8950471698113205, + "grad_norm": 1.9655312299728394, + "learning_rate": 7.579335032288262e-06, + "loss": 0.4314, + "num_input_tokens_seen": 8726272, + "step": 13390 + }, + { + "epoch": 7.897995283018868, + "grad_norm": 2.34226655960083, + "learning_rate": 7.577130727333598e-06, + "loss": 0.3827, + "num_input_tokens_seen": 8729216, + "step": 13395 + }, + { + "epoch": 7.900943396226415, + "grad_norm": 4.47019624710083, + "learning_rate": 7.5749257400724695e-06, + "loss": 0.4882, + "num_input_tokens_seen": 8731648, + "step": 13400 + }, + { + "epoch": 7.903891509433962, + "grad_norm": 1.927394986152649, + "learning_rate": 7.572720071088653e-06, + "loss": 0.4285, + "num_input_tokens_seen": 8735104, + "step": 13405 + }, + { + "epoch": 7.90683962264151, + "grad_norm": 2.8650500774383545, + "learning_rate": 7.570513720966108e-06, + "loss": 0.4292, + "num_input_tokens_seen": 8738432, + "step": 13410 + }, + { + "epoch": 7.909787735849057, + "grad_norm": 2.021859645843506, + "learning_rate": 7.56830669028898e-06, + "loss": 0.3657, + "num_input_tokens_seen": 8742080, + "step": 13415 + }, + { + "epoch": 7.912735849056604, + "grad_norm": 2.4289166927337646, + "learning_rate": 7.566098979641588e-06, + "loss": 0.4759, + "num_input_tokens_seen": 8745120, + "step": 13420 + }, + { + "epoch": 7.915683962264151, + "grad_norm": 4.298847198486328, + "learning_rate": 7.563890589608427e-06, + "loss": 0.4181, + "num_input_tokens_seen": 8750592, + "step": 13425 + }, + { + "epoch": 7.918632075471698, + "grad_norm": 1.7805455923080444, + "learning_rate": 7.561681520774187e-06, + "loss": 0.4343, + "num_input_tokens_seen": 8754432, + "step": 13430 + }, + { + "epoch": 7.921580188679245, + "grad_norm": 1.4597715139389038, + "learning_rate": 7.559471773723721e-06, + "loss": 0.5774, + "num_input_tokens_seen": 8758016, + "step": 13435 + }, + { + "epoch": 7.9245283018867925, + "grad_norm": 1.4705971479415894, + "learning_rate": 7.557261349042073e-06, + "loss": 0.3994, + "num_input_tokens_seen": 8760544, + "step": 13440 + }, + { + "epoch": 7.92747641509434, + "grad_norm": 2.225756883621216, + "learning_rate": 7.555050247314464e-06, + "loss": 0.4744, + "num_input_tokens_seen": 8765792, + "step": 13445 + }, + { + "epoch": 7.930424528301887, + "grad_norm": 2.0553815364837646, + "learning_rate": 7.552838469126289e-06, + "loss": 0.3427, + "num_input_tokens_seen": 8768608, + "step": 13450 + }, + { + "epoch": 7.933372641509434, + "grad_norm": 1.825537085533142, + "learning_rate": 7.550626015063125e-06, + "loss": 0.3828, + "num_input_tokens_seen": 8771680, + "step": 13455 + }, + { + "epoch": 7.936320754716981, + "grad_norm": 2.6737914085388184, + "learning_rate": 7.548412885710734e-06, + "loss": 0.3649, + "num_input_tokens_seen": 8774720, + "step": 13460 + }, + { + "epoch": 7.939268867924528, + "grad_norm": 1.4219367504119873, + "learning_rate": 7.546199081655048e-06, + "loss": 0.3616, + "num_input_tokens_seen": 8778080, + "step": 13465 + }, + { + "epoch": 7.942216981132075, + "grad_norm": 2.9162654876708984, + "learning_rate": 7.54398460348218e-06, + "loss": 0.5237, + "num_input_tokens_seen": 8781568, + "step": 13470 + }, + { + "epoch": 7.945165094339623, + "grad_norm": 3.0545060634613037, + "learning_rate": 7.541769451778425e-06, + "loss": 0.4307, + "num_input_tokens_seen": 8784768, + "step": 13475 + }, + { + "epoch": 7.94811320754717, + "grad_norm": 3.1044540405273438, + "learning_rate": 7.5395536271302536e-06, + "loss": 0.3514, + "num_input_tokens_seen": 8788032, + "step": 13480 + }, + { + "epoch": 7.951061320754717, + "grad_norm": 1.6437549591064453, + "learning_rate": 7.5373371301243136e-06, + "loss": 0.3308, + "num_input_tokens_seen": 8793408, + "step": 13485 + }, + { + "epoch": 7.9540094339622645, + "grad_norm": 1.6758924722671509, + "learning_rate": 7.535119961347433e-06, + "loss": 0.3728, + "num_input_tokens_seen": 8796704, + "step": 13490 + }, + { + "epoch": 7.956957547169811, + "grad_norm": 2.367798089981079, + "learning_rate": 7.532902121386618e-06, + "loss": 0.3715, + "num_input_tokens_seen": 8800448, + "step": 13495 + }, + { + "epoch": 7.959905660377358, + "grad_norm": 3.0903031826019287, + "learning_rate": 7.530683610829051e-06, + "loss": 0.5002, + "num_input_tokens_seen": 8803296, + "step": 13500 + }, + { + "epoch": 7.962853773584905, + "grad_norm": 2.4797983169555664, + "learning_rate": 7.5284644302620906e-06, + "loss": 0.4646, + "num_input_tokens_seen": 8805984, + "step": 13505 + }, + { + "epoch": 7.965801886792453, + "grad_norm": 2.400684118270874, + "learning_rate": 7.526244580273274e-06, + "loss": 0.5778, + "num_input_tokens_seen": 8811072, + "step": 13510 + }, + { + "epoch": 7.96875, + "grad_norm": 3.4902589321136475, + "learning_rate": 7.524024061450318e-06, + "loss": 0.3653, + "num_input_tokens_seen": 8814144, + "step": 13515 + }, + { + "epoch": 7.971698113207547, + "grad_norm": 2.744936466217041, + "learning_rate": 7.521802874381115e-06, + "loss": 0.4454, + "num_input_tokens_seen": 8816928, + "step": 13520 + }, + { + "epoch": 7.974646226415095, + "grad_norm": 2.6063666343688965, + "learning_rate": 7.519581019653731e-06, + "loss": 0.4588, + "num_input_tokens_seen": 8819168, + "step": 13525 + }, + { + "epoch": 7.977594339622642, + "grad_norm": 1.2602781057357788, + "learning_rate": 7.517358497856413e-06, + "loss": 0.4142, + "num_input_tokens_seen": 8822688, + "step": 13530 + }, + { + "epoch": 7.980542452830189, + "grad_norm": 2.7582015991210938, + "learning_rate": 7.515135309577584e-06, + "loss": 0.3912, + "num_input_tokens_seen": 8825952, + "step": 13535 + }, + { + "epoch": 7.9834905660377355, + "grad_norm": 2.1712729930877686, + "learning_rate": 7.5129114554058425e-06, + "loss": 0.5284, + "num_input_tokens_seen": 8829184, + "step": 13540 + }, + { + "epoch": 7.986438679245283, + "grad_norm": 2.7428975105285645, + "learning_rate": 7.510686935929963e-06, + "loss": 0.4749, + "num_input_tokens_seen": 8832768, + "step": 13545 + }, + { + "epoch": 7.98938679245283, + "grad_norm": 4.409809589385986, + "learning_rate": 7.5084617517388965e-06, + "loss": 0.3945, + "num_input_tokens_seen": 8835744, + "step": 13550 + }, + { + "epoch": 7.992334905660377, + "grad_norm": 3.4153990745544434, + "learning_rate": 7.506235903421771e-06, + "loss": 0.4586, + "num_input_tokens_seen": 8839232, + "step": 13555 + }, + { + "epoch": 7.995283018867925, + "grad_norm": 3.569145679473877, + "learning_rate": 7.504009391567889e-06, + "loss": 0.3999, + "num_input_tokens_seen": 8842720, + "step": 13560 + }, + { + "epoch": 7.998231132075472, + "grad_norm": 1.6880728006362915, + "learning_rate": 7.501782216766729e-06, + "loss": 0.3557, + "num_input_tokens_seen": 8845792, + "step": 13565 + }, + { + "epoch": 8.0, + "eval_loss": 0.5099420547485352, + "eval_runtime": 19.0498, + "eval_samples_per_second": 89.03, + "eval_steps_per_second": 22.257, + "num_input_tokens_seen": 8848168, + "step": 13568 + }, + { + "epoch": 8.00117924528302, + "grad_norm": 2.666106939315796, + "learning_rate": 7.499554379607944e-06, + "loss": 0.429, + "num_input_tokens_seen": 8849256, + "step": 13570 + }, + { + "epoch": 8.004127358490566, + "grad_norm": 1.7458851337432861, + "learning_rate": 7.497325880681365e-06, + "loss": 0.3561, + "num_input_tokens_seen": 8852584, + "step": 13575 + }, + { + "epoch": 8.007075471698114, + "grad_norm": 2.776974678039551, + "learning_rate": 7.495096720576994e-06, + "loss": 0.4698, + "num_input_tokens_seen": 8855496, + "step": 13580 + }, + { + "epoch": 8.01002358490566, + "grad_norm": 2.4449968338012695, + "learning_rate": 7.492866899885017e-06, + "loss": 0.3735, + "num_input_tokens_seen": 8858056, + "step": 13585 + }, + { + "epoch": 8.012971698113208, + "grad_norm": 2.096892833709717, + "learning_rate": 7.490636419195782e-06, + "loss": 0.5267, + "num_input_tokens_seen": 8863240, + "step": 13590 + }, + { + "epoch": 8.015919811320755, + "grad_norm": 2.1281867027282715, + "learning_rate": 7.488405279099821e-06, + "loss": 0.4313, + "num_input_tokens_seen": 8866536, + "step": 13595 + }, + { + "epoch": 8.018867924528301, + "grad_norm": 2.3399698734283447, + "learning_rate": 7.48617348018784e-06, + "loss": 0.4359, + "num_input_tokens_seen": 8869000, + "step": 13600 + }, + { + "epoch": 8.02181603773585, + "grad_norm": 1.100244402885437, + "learning_rate": 7.4839410230507134e-06, + "loss": 0.3305, + "num_input_tokens_seen": 8871656, + "step": 13605 + }, + { + "epoch": 8.024764150943396, + "grad_norm": 1.7875359058380127, + "learning_rate": 7.481707908279496e-06, + "loss": 0.3883, + "num_input_tokens_seen": 8874792, + "step": 13610 + }, + { + "epoch": 8.027712264150944, + "grad_norm": 1.9129972457885742, + "learning_rate": 7.4794741364654144e-06, + "loss": 0.2673, + "num_input_tokens_seen": 8877896, + "step": 13615 + }, + { + "epoch": 8.03066037735849, + "grad_norm": 3.9335618019104004, + "learning_rate": 7.477239708199871e-06, + "loss": 0.4106, + "num_input_tokens_seen": 8881096, + "step": 13620 + }, + { + "epoch": 8.033608490566039, + "grad_norm": 2.187635660171509, + "learning_rate": 7.475004624074434e-06, + "loss": 0.362, + "num_input_tokens_seen": 8883496, + "step": 13625 + }, + { + "epoch": 8.036556603773585, + "grad_norm": 1.756363034248352, + "learning_rate": 7.4727688846808595e-06, + "loss": 0.5068, + "num_input_tokens_seen": 8886856, + "step": 13630 + }, + { + "epoch": 8.039504716981131, + "grad_norm": 2.8471572399139404, + "learning_rate": 7.4705324906110654e-06, + "loss": 0.4106, + "num_input_tokens_seen": 8890280, + "step": 13635 + }, + { + "epoch": 8.04245283018868, + "grad_norm": 1.8761793375015259, + "learning_rate": 7.4682954424571466e-06, + "loss": 0.4369, + "num_input_tokens_seen": 8893384, + "step": 13640 + }, + { + "epoch": 8.045400943396226, + "grad_norm": 2.593594551086426, + "learning_rate": 7.466057740811372e-06, + "loss": 0.5718, + "num_input_tokens_seen": 8896136, + "step": 13645 + }, + { + "epoch": 8.048349056603774, + "grad_norm": 1.4992259740829468, + "learning_rate": 7.463819386266182e-06, + "loss": 0.4577, + "num_input_tokens_seen": 8899944, + "step": 13650 + }, + { + "epoch": 8.05129716981132, + "grad_norm": 2.8070898056030273, + "learning_rate": 7.461580379414191e-06, + "loss": 0.3305, + "num_input_tokens_seen": 8903400, + "step": 13655 + }, + { + "epoch": 8.054245283018869, + "grad_norm": 4.676871299743652, + "learning_rate": 7.459340720848187e-06, + "loss": 0.3973, + "num_input_tokens_seen": 8906856, + "step": 13660 + }, + { + "epoch": 8.057193396226415, + "grad_norm": 1.3727142810821533, + "learning_rate": 7.457100411161128e-06, + "loss": 0.3616, + "num_input_tokens_seen": 8909832, + "step": 13665 + }, + { + "epoch": 8.060141509433961, + "grad_norm": 5.175848007202148, + "learning_rate": 7.454859450946144e-06, + "loss": 0.5317, + "num_input_tokens_seen": 8912904, + "step": 13670 + }, + { + "epoch": 8.06308962264151, + "grad_norm": 3.9332563877105713, + "learning_rate": 7.4526178407965396e-06, + "loss": 0.4443, + "num_input_tokens_seen": 8916200, + "step": 13675 + }, + { + "epoch": 8.066037735849056, + "grad_norm": 2.150294065475464, + "learning_rate": 7.450375581305794e-06, + "loss": 0.4242, + "num_input_tokens_seen": 8919272, + "step": 13680 + }, + { + "epoch": 8.068985849056604, + "grad_norm": 1.47675621509552, + "learning_rate": 7.448132673067552e-06, + "loss": 0.3985, + "num_input_tokens_seen": 8923272, + "step": 13685 + }, + { + "epoch": 8.07193396226415, + "grad_norm": 2.911299467086792, + "learning_rate": 7.445889116675634e-06, + "loss": 0.6907, + "num_input_tokens_seen": 8927528, + "step": 13690 + }, + { + "epoch": 8.074882075471699, + "grad_norm": 2.4006049633026123, + "learning_rate": 7.443644912724031e-06, + "loss": 0.3503, + "num_input_tokens_seen": 8929992, + "step": 13695 + }, + { + "epoch": 8.077830188679245, + "grad_norm": 2.804715633392334, + "learning_rate": 7.441400061806907e-06, + "loss": 0.4373, + "num_input_tokens_seen": 8933864, + "step": 13700 + }, + { + "epoch": 8.080778301886792, + "grad_norm": 1.6213233470916748, + "learning_rate": 7.439154564518592e-06, + "loss": 0.5058, + "num_input_tokens_seen": 8937832, + "step": 13705 + }, + { + "epoch": 8.08372641509434, + "grad_norm": 1.528058648109436, + "learning_rate": 7.436908421453597e-06, + "loss": 0.3998, + "num_input_tokens_seen": 8941224, + "step": 13710 + }, + { + "epoch": 8.086674528301886, + "grad_norm": 1.7929173707962036, + "learning_rate": 7.434661633206593e-06, + "loss": 0.3248, + "num_input_tokens_seen": 8944168, + "step": 13715 + }, + { + "epoch": 8.089622641509434, + "grad_norm": 3.8835673332214355, + "learning_rate": 7.4324142003724286e-06, + "loss": 0.3541, + "num_input_tokens_seen": 8946792, + "step": 13720 + }, + { + "epoch": 8.09257075471698, + "grad_norm": 1.6249452829360962, + "learning_rate": 7.430166123546122e-06, + "loss": 0.4109, + "num_input_tokens_seen": 8950216, + "step": 13725 + }, + { + "epoch": 8.095518867924529, + "grad_norm": 1.541869878768921, + "learning_rate": 7.427917403322862e-06, + "loss": 0.4462, + "num_input_tokens_seen": 8954312, + "step": 13730 + }, + { + "epoch": 8.098466981132075, + "grad_norm": 2.542351245880127, + "learning_rate": 7.425668040298003e-06, + "loss": 0.4719, + "num_input_tokens_seen": 8958376, + "step": 13735 + }, + { + "epoch": 8.101415094339623, + "grad_norm": 2.4646389484405518, + "learning_rate": 7.4234180350670785e-06, + "loss": 0.336, + "num_input_tokens_seen": 8961384, + "step": 13740 + }, + { + "epoch": 8.10436320754717, + "grad_norm": 2.8588569164276123, + "learning_rate": 7.421167388225785e-06, + "loss": 0.3946, + "num_input_tokens_seen": 8964264, + "step": 13745 + }, + { + "epoch": 8.107311320754716, + "grad_norm": 1.9972561597824097, + "learning_rate": 7.41891610036999e-06, + "loss": 0.4015, + "num_input_tokens_seen": 8966728, + "step": 13750 + }, + { + "epoch": 8.110259433962264, + "grad_norm": 2.407116174697876, + "learning_rate": 7.416664172095732e-06, + "loss": 0.2923, + "num_input_tokens_seen": 8969544, + "step": 13755 + }, + { + "epoch": 8.11320754716981, + "grad_norm": 2.901962995529175, + "learning_rate": 7.414411603999221e-06, + "loss": 0.3625, + "num_input_tokens_seen": 8972776, + "step": 13760 + }, + { + "epoch": 8.116155660377359, + "grad_norm": 3.874617099761963, + "learning_rate": 7.4121583966768295e-06, + "loss": 0.3316, + "num_input_tokens_seen": 8975624, + "step": 13765 + }, + { + "epoch": 8.119103773584905, + "grad_norm": 1.8574072122573853, + "learning_rate": 7.409904550725109e-06, + "loss": 0.4319, + "num_input_tokens_seen": 8980392, + "step": 13770 + }, + { + "epoch": 8.122051886792454, + "grad_norm": 1.4014862775802612, + "learning_rate": 7.407650066740771e-06, + "loss": 0.433, + "num_input_tokens_seen": 8984008, + "step": 13775 + }, + { + "epoch": 8.125, + "grad_norm": 4.182077407836914, + "learning_rate": 7.405394945320702e-06, + "loss": 0.3382, + "num_input_tokens_seen": 8987304, + "step": 13780 + }, + { + "epoch": 8.127948113207546, + "grad_norm": 2.386167526245117, + "learning_rate": 7.403139187061955e-06, + "loss": 0.4656, + "num_input_tokens_seen": 8990216, + "step": 13785 + }, + { + "epoch": 8.130896226415095, + "grad_norm": 3.9744343757629395, + "learning_rate": 7.400882792561752e-06, + "loss": 0.4925, + "num_input_tokens_seen": 8993480, + "step": 13790 + }, + { + "epoch": 8.133844339622641, + "grad_norm": 3.4291646480560303, + "learning_rate": 7.39862576241748e-06, + "loss": 0.3748, + "num_input_tokens_seen": 8999112, + "step": 13795 + }, + { + "epoch": 8.13679245283019, + "grad_norm": 2.106797695159912, + "learning_rate": 7.396368097226703e-06, + "loss": 0.4129, + "num_input_tokens_seen": 9002824, + "step": 13800 + }, + { + "epoch": 8.139740566037736, + "grad_norm": 3.0510683059692383, + "learning_rate": 7.394109797587144e-06, + "loss": 0.4775, + "num_input_tokens_seen": 9005736, + "step": 13805 + }, + { + "epoch": 8.142688679245284, + "grad_norm": 2.3914542198181152, + "learning_rate": 7.3918508640966956e-06, + "loss": 0.4175, + "num_input_tokens_seen": 9009416, + "step": 13810 + }, + { + "epoch": 8.14563679245283, + "grad_norm": 1.9719914197921753, + "learning_rate": 7.389591297353424e-06, + "loss": 0.4633, + "num_input_tokens_seen": 9012456, + "step": 13815 + }, + { + "epoch": 8.148584905660377, + "grad_norm": 2.599912643432617, + "learning_rate": 7.3873310979555565e-06, + "loss": 0.376, + "num_input_tokens_seen": 9015528, + "step": 13820 + }, + { + "epoch": 8.151533018867925, + "grad_norm": 2.6653919219970703, + "learning_rate": 7.385070266501495e-06, + "loss": 0.3802, + "num_input_tokens_seen": 9019112, + "step": 13825 + }, + { + "epoch": 8.154481132075471, + "grad_norm": 2.2161753177642822, + "learning_rate": 7.382808803589798e-06, + "loss": 0.3897, + "num_input_tokens_seen": 9021736, + "step": 13830 + }, + { + "epoch": 8.15742924528302, + "grad_norm": 1.1827857494354248, + "learning_rate": 7.380546709819204e-06, + "loss": 0.4062, + "num_input_tokens_seen": 9024936, + "step": 13835 + }, + { + "epoch": 8.160377358490566, + "grad_norm": 2.122030258178711, + "learning_rate": 7.378283985788608e-06, + "loss": 0.372, + "num_input_tokens_seen": 9027560, + "step": 13840 + }, + { + "epoch": 8.163325471698114, + "grad_norm": 2.7912421226501465, + "learning_rate": 7.376020632097076e-06, + "loss": 0.4653, + "num_input_tokens_seen": 9030504, + "step": 13845 + }, + { + "epoch": 8.16627358490566, + "grad_norm": 1.2989771366119385, + "learning_rate": 7.373756649343841e-06, + "loss": 0.2937, + "num_input_tokens_seen": 9033704, + "step": 13850 + }, + { + "epoch": 8.169221698113208, + "grad_norm": 1.4684436321258545, + "learning_rate": 7.371492038128305e-06, + "loss": 0.3975, + "num_input_tokens_seen": 9037800, + "step": 13855 + }, + { + "epoch": 8.172169811320755, + "grad_norm": 2.1536645889282227, + "learning_rate": 7.36922679905003e-06, + "loss": 0.3701, + "num_input_tokens_seen": 9041160, + "step": 13860 + }, + { + "epoch": 8.175117924528301, + "grad_norm": 2.0741193294525146, + "learning_rate": 7.366960932708749e-06, + "loss": 0.3279, + "num_input_tokens_seen": 9044200, + "step": 13865 + }, + { + "epoch": 8.17806603773585, + "grad_norm": 2.3600454330444336, + "learning_rate": 7.364694439704361e-06, + "loss": 0.348, + "num_input_tokens_seen": 9046856, + "step": 13870 + }, + { + "epoch": 8.181014150943396, + "grad_norm": 4.38322639465332, + "learning_rate": 7.3624273206369264e-06, + "loss": 0.2514, + "num_input_tokens_seen": 9049288, + "step": 13875 + }, + { + "epoch": 8.183962264150944, + "grad_norm": 3.125810384750366, + "learning_rate": 7.360159576106681e-06, + "loss": 0.3868, + "num_input_tokens_seen": 9051976, + "step": 13880 + }, + { + "epoch": 8.18691037735849, + "grad_norm": 1.6389169692993164, + "learning_rate": 7.357891206714014e-06, + "loss": 0.2955, + "num_input_tokens_seen": 9055336, + "step": 13885 + }, + { + "epoch": 8.189858490566039, + "grad_norm": 3.010446071624756, + "learning_rate": 7.355622213059487e-06, + "loss": 0.3901, + "num_input_tokens_seen": 9058120, + "step": 13890 + }, + { + "epoch": 8.192806603773585, + "grad_norm": 2.7184367179870605, + "learning_rate": 7.353352595743829e-06, + "loss": 0.2998, + "num_input_tokens_seen": 9061160, + "step": 13895 + }, + { + "epoch": 8.195754716981131, + "grad_norm": 1.5839126110076904, + "learning_rate": 7.351082355367928e-06, + "loss": 0.4522, + "num_input_tokens_seen": 9064552, + "step": 13900 + }, + { + "epoch": 8.19870283018868, + "grad_norm": 2.3468291759490967, + "learning_rate": 7.34881149253284e-06, + "loss": 0.3219, + "num_input_tokens_seen": 9068488, + "step": 13905 + }, + { + "epoch": 8.201650943396226, + "grad_norm": 2.5472218990325928, + "learning_rate": 7.346540007839787e-06, + "loss": 0.4691, + "num_input_tokens_seen": 9071752, + "step": 13910 + }, + { + "epoch": 8.204599056603774, + "grad_norm": 5.368915557861328, + "learning_rate": 7.344267901890154e-06, + "loss": 0.4471, + "num_input_tokens_seen": 9074120, + "step": 13915 + }, + { + "epoch": 8.20754716981132, + "grad_norm": 3.0006468296051025, + "learning_rate": 7.341995175285491e-06, + "loss": 0.4004, + "num_input_tokens_seen": 9076968, + "step": 13920 + }, + { + "epoch": 8.210495283018869, + "grad_norm": 2.642289638519287, + "learning_rate": 7.339721828627512e-06, + "loss": 0.4406, + "num_input_tokens_seen": 9080264, + "step": 13925 + }, + { + "epoch": 8.213443396226415, + "grad_norm": 3.0328216552734375, + "learning_rate": 7.337447862518096e-06, + "loss": 0.2811, + "num_input_tokens_seen": 9082984, + "step": 13930 + }, + { + "epoch": 8.216391509433961, + "grad_norm": 1.7147077322006226, + "learning_rate": 7.335173277559282e-06, + "loss": 0.3452, + "num_input_tokens_seen": 9086344, + "step": 13935 + }, + { + "epoch": 8.21933962264151, + "grad_norm": 2.7895584106445312, + "learning_rate": 7.332898074353281e-06, + "loss": 0.3691, + "num_input_tokens_seen": 9089448, + "step": 13940 + }, + { + "epoch": 8.222287735849056, + "grad_norm": 1.8917620182037354, + "learning_rate": 7.330622253502461e-06, + "loss": 0.2899, + "num_input_tokens_seen": 9092680, + "step": 13945 + }, + { + "epoch": 8.225235849056604, + "grad_norm": 1.8764305114746094, + "learning_rate": 7.3283458156093534e-06, + "loss": 0.2974, + "num_input_tokens_seen": 9095304, + "step": 13950 + }, + { + "epoch": 8.22818396226415, + "grad_norm": 1.8710672855377197, + "learning_rate": 7.326068761276657e-06, + "loss": 0.3705, + "num_input_tokens_seen": 9098504, + "step": 13955 + }, + { + "epoch": 8.231132075471699, + "grad_norm": 2.5115270614624023, + "learning_rate": 7.323791091107231e-06, + "loss": 0.36, + "num_input_tokens_seen": 9101512, + "step": 13960 + }, + { + "epoch": 8.234080188679245, + "grad_norm": 5.925174236297607, + "learning_rate": 7.3215128057040986e-06, + "loss": 0.4934, + "num_input_tokens_seen": 9103880, + "step": 13965 + }, + { + "epoch": 8.237028301886792, + "grad_norm": 6.538854598999023, + "learning_rate": 7.319233905670447e-06, + "loss": 0.4085, + "num_input_tokens_seen": 9106440, + "step": 13970 + }, + { + "epoch": 8.23997641509434, + "grad_norm": 2.83817720413208, + "learning_rate": 7.316954391609622e-06, + "loss": 0.3234, + "num_input_tokens_seen": 9109576, + "step": 13975 + }, + { + "epoch": 8.242924528301886, + "grad_norm": 2.789663791656494, + "learning_rate": 7.314674264125137e-06, + "loss": 0.4577, + "num_input_tokens_seen": 9113128, + "step": 13980 + }, + { + "epoch": 8.245872641509434, + "grad_norm": 2.0470354557037354, + "learning_rate": 7.312393523820665e-06, + "loss": 0.46, + "num_input_tokens_seen": 9116552, + "step": 13985 + }, + { + "epoch": 8.24882075471698, + "grad_norm": 2.7704360485076904, + "learning_rate": 7.310112171300041e-06, + "loss": 0.4312, + "num_input_tokens_seen": 9119656, + "step": 13990 + }, + { + "epoch": 8.251768867924529, + "grad_norm": 3.5966460704803467, + "learning_rate": 7.307830207167263e-06, + "loss": 0.4338, + "num_input_tokens_seen": 9122568, + "step": 13995 + }, + { + "epoch": 8.254716981132075, + "grad_norm": 4.281689167022705, + "learning_rate": 7.305547632026493e-06, + "loss": 0.4016, + "num_input_tokens_seen": 9125160, + "step": 14000 + }, + { + "epoch": 8.257665094339623, + "grad_norm": 3.0443994998931885, + "learning_rate": 7.3032644464820515e-06, + "loss": 0.4734, + "num_input_tokens_seen": 9128520, + "step": 14005 + }, + { + "epoch": 8.26061320754717, + "grad_norm": 2.897758960723877, + "learning_rate": 7.30098065113842e-06, + "loss": 0.3919, + "num_input_tokens_seen": 9132104, + "step": 14010 + }, + { + "epoch": 8.263561320754716, + "grad_norm": 1.5462449789047241, + "learning_rate": 7.298696246600244e-06, + "loss": 0.4209, + "num_input_tokens_seen": 9135432, + "step": 14015 + }, + { + "epoch": 8.266509433962264, + "grad_norm": 2.461265802383423, + "learning_rate": 7.2964112334723315e-06, + "loss": 0.3581, + "num_input_tokens_seen": 9139944, + "step": 14020 + }, + { + "epoch": 8.26945754716981, + "grad_norm": 2.317335605621338, + "learning_rate": 7.294125612359647e-06, + "loss": 0.389, + "num_input_tokens_seen": 9143208, + "step": 14025 + }, + { + "epoch": 8.272405660377359, + "grad_norm": 2.1711039543151855, + "learning_rate": 7.291839383867318e-06, + "loss": 0.2363, + "num_input_tokens_seen": 9146216, + "step": 14030 + }, + { + "epoch": 8.275353773584905, + "grad_norm": 1.928035855293274, + "learning_rate": 7.289552548600638e-06, + "loss": 0.3073, + "num_input_tokens_seen": 9149288, + "step": 14035 + }, + { + "epoch": 8.278301886792454, + "grad_norm": 3.0396032333374023, + "learning_rate": 7.287265107165052e-06, + "loss": 0.3135, + "num_input_tokens_seen": 9153448, + "step": 14040 + }, + { + "epoch": 8.28125, + "grad_norm": 2.2305355072021484, + "learning_rate": 7.284977060166171e-06, + "loss": 0.4462, + "num_input_tokens_seen": 9157832, + "step": 14045 + }, + { + "epoch": 8.284198113207546, + "grad_norm": 2.923834800720215, + "learning_rate": 7.282688408209766e-06, + "loss": 0.4026, + "num_input_tokens_seen": 9161096, + "step": 14050 + }, + { + "epoch": 8.287146226415095, + "grad_norm": 3.395155429840088, + "learning_rate": 7.2803991519017655e-06, + "loss": 0.4475, + "num_input_tokens_seen": 9164104, + "step": 14055 + }, + { + "epoch": 8.290094339622641, + "grad_norm": 2.713153600692749, + "learning_rate": 7.2781092918482634e-06, + "loss": 0.4329, + "num_input_tokens_seen": 9166728, + "step": 14060 + }, + { + "epoch": 8.29304245283019, + "grad_norm": 2.379946708679199, + "learning_rate": 7.275818828655508e-06, + "loss": 0.4178, + "num_input_tokens_seen": 9170056, + "step": 14065 + }, + { + "epoch": 8.295990566037736, + "grad_norm": 3.942443609237671, + "learning_rate": 7.27352776292991e-06, + "loss": 0.3621, + "num_input_tokens_seen": 9173192, + "step": 14070 + }, + { + "epoch": 8.298938679245284, + "grad_norm": 2.1148619651794434, + "learning_rate": 7.271236095278036e-06, + "loss": 0.4185, + "num_input_tokens_seen": 9176616, + "step": 14075 + }, + { + "epoch": 8.30188679245283, + "grad_norm": 3.0360612869262695, + "learning_rate": 7.2689438263066195e-06, + "loss": 0.4694, + "num_input_tokens_seen": 9179976, + "step": 14080 + }, + { + "epoch": 8.304834905660377, + "grad_norm": 3.7810540199279785, + "learning_rate": 7.266650956622546e-06, + "loss": 0.4752, + "num_input_tokens_seen": 9183080, + "step": 14085 + }, + { + "epoch": 8.307783018867925, + "grad_norm": 1.861411690711975, + "learning_rate": 7.2643574868328625e-06, + "loss": 0.4647, + "num_input_tokens_seen": 9185416, + "step": 14090 + }, + { + "epoch": 8.310731132075471, + "grad_norm": 2.0562405586242676, + "learning_rate": 7.262063417544776e-06, + "loss": 0.3161, + "num_input_tokens_seen": 9188392, + "step": 14095 + }, + { + "epoch": 8.31367924528302, + "grad_norm": 4.4509596824646, + "learning_rate": 7.25976874936565e-06, + "loss": 0.4071, + "num_input_tokens_seen": 9192360, + "step": 14100 + }, + { + "epoch": 8.316627358490566, + "grad_norm": 4.977168083190918, + "learning_rate": 7.257473482903009e-06, + "loss": 0.4578, + "num_input_tokens_seen": 9194728, + "step": 14105 + }, + { + "epoch": 8.319575471698114, + "grad_norm": 2.217033863067627, + "learning_rate": 7.255177618764534e-06, + "loss": 0.4118, + "num_input_tokens_seen": 9198216, + "step": 14110 + }, + { + "epoch": 8.32252358490566, + "grad_norm": 2.2413735389709473, + "learning_rate": 7.252881157558065e-06, + "loss": 0.4386, + "num_input_tokens_seen": 9201768, + "step": 14115 + }, + { + "epoch": 8.325471698113208, + "grad_norm": 3.2492992877960205, + "learning_rate": 7.250584099891602e-06, + "loss": 0.3745, + "num_input_tokens_seen": 9204712, + "step": 14120 + }, + { + "epoch": 8.328419811320755, + "grad_norm": 3.902268648147583, + "learning_rate": 7.248286446373296e-06, + "loss": 0.3689, + "num_input_tokens_seen": 9207112, + "step": 14125 + }, + { + "epoch": 8.331367924528301, + "grad_norm": 1.6623175144195557, + "learning_rate": 7.245988197611466e-06, + "loss": 0.3721, + "num_input_tokens_seen": 9210568, + "step": 14130 + }, + { + "epoch": 8.33431603773585, + "grad_norm": 2.9371471405029297, + "learning_rate": 7.2436893542145805e-06, + "loss": 0.3289, + "num_input_tokens_seen": 9214248, + "step": 14135 + }, + { + "epoch": 8.337264150943396, + "grad_norm": 2.0764291286468506, + "learning_rate": 7.241389916791269e-06, + "loss": 0.4597, + "num_input_tokens_seen": 9218376, + "step": 14140 + }, + { + "epoch": 8.340212264150944, + "grad_norm": 1.462090015411377, + "learning_rate": 7.239089885950317e-06, + "loss": 0.4129, + "num_input_tokens_seen": 9222152, + "step": 14145 + }, + { + "epoch": 8.34316037735849, + "grad_norm": 1.7906250953674316, + "learning_rate": 7.236789262300667e-06, + "loss": 0.3677, + "num_input_tokens_seen": 9225448, + "step": 14150 + }, + { + "epoch": 8.346108490566039, + "grad_norm": 1.7789106369018555, + "learning_rate": 7.23448804645142e-06, + "loss": 0.4122, + "num_input_tokens_seen": 9228904, + "step": 14155 + }, + { + "epoch": 8.349056603773585, + "grad_norm": 2.3959455490112305, + "learning_rate": 7.232186239011834e-06, + "loss": 0.3969, + "num_input_tokens_seen": 9232456, + "step": 14160 + }, + { + "epoch": 8.352004716981131, + "grad_norm": 3.3744118213653564, + "learning_rate": 7.2298838405913195e-06, + "loss": 0.3695, + "num_input_tokens_seen": 9235496, + "step": 14165 + }, + { + "epoch": 8.35495283018868, + "grad_norm": 2.167912721633911, + "learning_rate": 7.227580851799448e-06, + "loss": 0.4045, + "num_input_tokens_seen": 9238184, + "step": 14170 + }, + { + "epoch": 8.357900943396226, + "grad_norm": 2.4132373332977295, + "learning_rate": 7.2252772732459455e-06, + "loss": 0.4096, + "num_input_tokens_seen": 9240680, + "step": 14175 + }, + { + "epoch": 8.360849056603774, + "grad_norm": 6.015293121337891, + "learning_rate": 7.222973105540696e-06, + "loss": 0.3944, + "num_input_tokens_seen": 9243944, + "step": 14180 + }, + { + "epoch": 8.36379716981132, + "grad_norm": 1.475831151008606, + "learning_rate": 7.2206683492937345e-06, + "loss": 0.3471, + "num_input_tokens_seen": 9246792, + "step": 14185 + }, + { + "epoch": 8.366745283018869, + "grad_norm": 1.8131763935089111, + "learning_rate": 7.218363005115259e-06, + "loss": 0.3932, + "num_input_tokens_seen": 9251048, + "step": 14190 + }, + { + "epoch": 8.369693396226415, + "grad_norm": 2.7515087127685547, + "learning_rate": 7.216057073615617e-06, + "loss": 0.4088, + "num_input_tokens_seen": 9254856, + "step": 14195 + }, + { + "epoch": 8.372641509433961, + "grad_norm": 4.056880950927734, + "learning_rate": 7.21375055540531e-06, + "loss": 0.4401, + "num_input_tokens_seen": 9257832, + "step": 14200 + }, + { + "epoch": 8.37558962264151, + "grad_norm": 2.2605767250061035, + "learning_rate": 7.211443451095007e-06, + "loss": 0.3207, + "num_input_tokens_seen": 9261352, + "step": 14205 + }, + { + "epoch": 8.378537735849056, + "grad_norm": 1.5928245782852173, + "learning_rate": 7.2091357612955185e-06, + "loss": 0.3891, + "num_input_tokens_seen": 9265096, + "step": 14210 + }, + { + "epoch": 8.381485849056604, + "grad_norm": 2.0999646186828613, + "learning_rate": 7.206827486617816e-06, + "loss": 0.3951, + "num_input_tokens_seen": 9268904, + "step": 14215 + }, + { + "epoch": 8.38443396226415, + "grad_norm": 3.4141688346862793, + "learning_rate": 7.204518627673026e-06, + "loss": 0.5359, + "num_input_tokens_seen": 9271464, + "step": 14220 + }, + { + "epoch": 8.387382075471699, + "grad_norm": 2.6523115634918213, + "learning_rate": 7.202209185072428e-06, + "loss": 0.549, + "num_input_tokens_seen": 9274280, + "step": 14225 + }, + { + "epoch": 8.390330188679245, + "grad_norm": 2.240976572036743, + "learning_rate": 7.199899159427457e-06, + "loss": 0.3769, + "num_input_tokens_seen": 9277224, + "step": 14230 + }, + { + "epoch": 8.393278301886792, + "grad_norm": 1.6264857053756714, + "learning_rate": 7.1975885513497035e-06, + "loss": 0.3814, + "num_input_tokens_seen": 9281224, + "step": 14235 + }, + { + "epoch": 8.39622641509434, + "grad_norm": 4.339727401733398, + "learning_rate": 7.195277361450909e-06, + "loss": 0.4956, + "num_input_tokens_seen": 9283912, + "step": 14240 + }, + { + "epoch": 8.399174528301886, + "grad_norm": 2.5912351608276367, + "learning_rate": 7.192965590342973e-06, + "loss": 0.628, + "num_input_tokens_seen": 9288616, + "step": 14245 + }, + { + "epoch": 8.402122641509434, + "grad_norm": 2.866222381591797, + "learning_rate": 7.190653238637945e-06, + "loss": 0.493, + "num_input_tokens_seen": 9291400, + "step": 14250 + }, + { + "epoch": 8.40507075471698, + "grad_norm": 3.4372546672821045, + "learning_rate": 7.18834030694803e-06, + "loss": 0.4921, + "num_input_tokens_seen": 9294632, + "step": 14255 + }, + { + "epoch": 8.408018867924529, + "grad_norm": 3.0348970890045166, + "learning_rate": 7.186026795885589e-06, + "loss": 0.4218, + "num_input_tokens_seen": 9298664, + "step": 14260 + }, + { + "epoch": 8.410966981132075, + "grad_norm": 2.5941033363342285, + "learning_rate": 7.183712706063133e-06, + "loss": 0.3715, + "num_input_tokens_seen": 9301544, + "step": 14265 + }, + { + "epoch": 8.413915094339623, + "grad_norm": 2.5344462394714355, + "learning_rate": 7.1813980380933255e-06, + "loss": 0.4263, + "num_input_tokens_seen": 9304648, + "step": 14270 + }, + { + "epoch": 8.41686320754717, + "grad_norm": 0.8082200288772583, + "learning_rate": 7.179082792588986e-06, + "loss": 0.3954, + "num_input_tokens_seen": 9307752, + "step": 14275 + }, + { + "epoch": 8.419811320754716, + "grad_norm": 2.5596089363098145, + "learning_rate": 7.176766970163087e-06, + "loss": 0.3884, + "num_input_tokens_seen": 9310472, + "step": 14280 + }, + { + "epoch": 8.422759433962264, + "grad_norm": 2.9833126068115234, + "learning_rate": 7.17445057142875e-06, + "loss": 0.3593, + "num_input_tokens_seen": 9314312, + "step": 14285 + }, + { + "epoch": 8.42570754716981, + "grad_norm": 3.1281986236572266, + "learning_rate": 7.172133596999253e-06, + "loss": 0.3455, + "num_input_tokens_seen": 9317704, + "step": 14290 + }, + { + "epoch": 8.428655660377359, + "grad_norm": 2.5291125774383545, + "learning_rate": 7.1698160474880255e-06, + "loss": 0.5131, + "num_input_tokens_seen": 9321000, + "step": 14295 + }, + { + "epoch": 8.431603773584905, + "grad_norm": 1.9944555759429932, + "learning_rate": 7.167497923508648e-06, + "loss": 0.3584, + "num_input_tokens_seen": 9323688, + "step": 14300 + }, + { + "epoch": 8.434551886792454, + "grad_norm": 2.2621262073516846, + "learning_rate": 7.165179225674854e-06, + "loss": 0.4258, + "num_input_tokens_seen": 9326568, + "step": 14305 + }, + { + "epoch": 8.4375, + "grad_norm": 2.481161594390869, + "learning_rate": 7.1628599546005276e-06, + "loss": 0.338, + "num_input_tokens_seen": 9329224, + "step": 14310 + }, + { + "epoch": 8.440448113207546, + "grad_norm": 1.8259048461914062, + "learning_rate": 7.160540110899708e-06, + "loss": 0.3929, + "num_input_tokens_seen": 9332360, + "step": 14315 + }, + { + "epoch": 8.443396226415095, + "grad_norm": 4.227758407592773, + "learning_rate": 7.158219695186582e-06, + "loss": 0.5698, + "num_input_tokens_seen": 9335176, + "step": 14320 + }, + { + "epoch": 8.446344339622641, + "grad_norm": 2.5277490615844727, + "learning_rate": 7.1558987080754905e-06, + "loss": 0.456, + "num_input_tokens_seen": 9339624, + "step": 14325 + }, + { + "epoch": 8.44929245283019, + "grad_norm": 1.8370674848556519, + "learning_rate": 7.1535771501809245e-06, + "loss": 0.4361, + "num_input_tokens_seen": 9342696, + "step": 14330 + }, + { + "epoch": 8.452240566037736, + "grad_norm": 3.5850014686584473, + "learning_rate": 7.151255022117527e-06, + "loss": 0.3994, + "num_input_tokens_seen": 9345000, + "step": 14335 + }, + { + "epoch": 8.455188679245284, + "grad_norm": 1.6765283346176147, + "learning_rate": 7.148932324500091e-06, + "loss": 0.2395, + "num_input_tokens_seen": 9348072, + "step": 14340 + }, + { + "epoch": 8.45813679245283, + "grad_norm": 2.8801708221435547, + "learning_rate": 7.146609057943559e-06, + "loss": 0.3975, + "num_input_tokens_seen": 9351592, + "step": 14345 + }, + { + "epoch": 8.461084905660377, + "grad_norm": 3.0807976722717285, + "learning_rate": 7.14428522306303e-06, + "loss": 0.4588, + "num_input_tokens_seen": 9354312, + "step": 14350 + }, + { + "epoch": 8.464033018867925, + "grad_norm": 1.7765644788742065, + "learning_rate": 7.141960820473745e-06, + "loss": 0.302, + "num_input_tokens_seen": 9357288, + "step": 14355 + }, + { + "epoch": 8.466981132075471, + "grad_norm": 2.011965751647949, + "learning_rate": 7.139635850791102e-06, + "loss": 0.4624, + "num_input_tokens_seen": 9360040, + "step": 14360 + }, + { + "epoch": 8.46992924528302, + "grad_norm": 2.324897527694702, + "learning_rate": 7.137310314630647e-06, + "loss": 0.4772, + "num_input_tokens_seen": 9363112, + "step": 14365 + }, + { + "epoch": 8.472877358490566, + "grad_norm": 3.6103196144104004, + "learning_rate": 7.134984212608074e-06, + "loss": 0.3807, + "num_input_tokens_seen": 9369000, + "step": 14370 + }, + { + "epoch": 8.475825471698114, + "grad_norm": 2.622830629348755, + "learning_rate": 7.13265754533923e-06, + "loss": 0.3546, + "num_input_tokens_seen": 9372264, + "step": 14375 + }, + { + "epoch": 8.47877358490566, + "grad_norm": 13.428444862365723, + "learning_rate": 7.130330313440109e-06, + "loss": 0.4717, + "num_input_tokens_seen": 9375176, + "step": 14380 + }, + { + "epoch": 8.481721698113208, + "grad_norm": 2.7904152870178223, + "learning_rate": 7.128002517526856e-06, + "loss": 0.3451, + "num_input_tokens_seen": 9378088, + "step": 14385 + }, + { + "epoch": 8.484669811320755, + "grad_norm": 2.313046455383301, + "learning_rate": 7.1256741582157654e-06, + "loss": 0.3807, + "num_input_tokens_seen": 9381448, + "step": 14390 + }, + { + "epoch": 8.487617924528301, + "grad_norm": 1.846826195716858, + "learning_rate": 7.123345236123282e-06, + "loss": 0.3888, + "num_input_tokens_seen": 9385576, + "step": 14395 + }, + { + "epoch": 8.49056603773585, + "grad_norm": 2.162078857421875, + "learning_rate": 7.121015751865994e-06, + "loss": 0.3748, + "num_input_tokens_seen": 9388296, + "step": 14400 + }, + { + "epoch": 8.493514150943396, + "grad_norm": 1.653348684310913, + "learning_rate": 7.118685706060645e-06, + "loss": 0.4586, + "num_input_tokens_seen": 9392552, + "step": 14405 + }, + { + "epoch": 8.496462264150944, + "grad_norm": 1.6599537134170532, + "learning_rate": 7.116355099324126e-06, + "loss": 0.3036, + "num_input_tokens_seen": 9396296, + "step": 14410 + }, + { + "epoch": 8.49941037735849, + "grad_norm": 3.6936824321746826, + "learning_rate": 7.114023932273471e-06, + "loss": 0.4239, + "num_input_tokens_seen": 9398408, + "step": 14415 + }, + { + "epoch": 8.502358490566039, + "grad_norm": 1.5160828828811646, + "learning_rate": 7.111692205525871e-06, + "loss": 0.3825, + "num_input_tokens_seen": 9401352, + "step": 14420 + }, + { + "epoch": 8.505306603773585, + "grad_norm": 2.0115528106689453, + "learning_rate": 7.109359919698658e-06, + "loss": 0.4372, + "num_input_tokens_seen": 9404360, + "step": 14425 + }, + { + "epoch": 8.508254716981131, + "grad_norm": 4.964980125427246, + "learning_rate": 7.107027075409316e-06, + "loss": 0.3583, + "num_input_tokens_seen": 9407080, + "step": 14430 + }, + { + "epoch": 8.51120283018868, + "grad_norm": 3.491225242614746, + "learning_rate": 7.104693673275475e-06, + "loss": 0.5223, + "num_input_tokens_seen": 9410216, + "step": 14435 + }, + { + "epoch": 8.514150943396226, + "grad_norm": 2.5000360012054443, + "learning_rate": 7.1023597139149115e-06, + "loss": 0.53, + "num_input_tokens_seen": 9413352, + "step": 14440 + }, + { + "epoch": 8.517099056603774, + "grad_norm": 2.085036277770996, + "learning_rate": 7.100025197945555e-06, + "loss": 0.2691, + "num_input_tokens_seen": 9417128, + "step": 14445 + }, + { + "epoch": 8.52004716981132, + "grad_norm": 2.489471197128296, + "learning_rate": 7.097690125985476e-06, + "loss": 0.3329, + "num_input_tokens_seen": 9419656, + "step": 14450 + }, + { + "epoch": 8.522995283018869, + "grad_norm": 3.95540452003479, + "learning_rate": 7.095354498652895e-06, + "loss": 0.4832, + "num_input_tokens_seen": 9422568, + "step": 14455 + }, + { + "epoch": 8.525943396226415, + "grad_norm": 4.1191229820251465, + "learning_rate": 7.093018316566182e-06, + "loss": 0.3866, + "num_input_tokens_seen": 9425032, + "step": 14460 + }, + { + "epoch": 8.528891509433961, + "grad_norm": 2.906787633895874, + "learning_rate": 7.0906815803438465e-06, + "loss": 0.5568, + "num_input_tokens_seen": 9428904, + "step": 14465 + }, + { + "epoch": 8.53183962264151, + "grad_norm": 4.482585906982422, + "learning_rate": 7.088344290604554e-06, + "loss": 0.4224, + "num_input_tokens_seen": 9431912, + "step": 14470 + }, + { + "epoch": 8.534787735849056, + "grad_norm": 1.3960684537887573, + "learning_rate": 7.086006447967111e-06, + "loss": 0.3612, + "num_input_tokens_seen": 9435464, + "step": 14475 + }, + { + "epoch": 8.537735849056604, + "grad_norm": 3.8985610008239746, + "learning_rate": 7.08366805305047e-06, + "loss": 0.5258, + "num_input_tokens_seen": 9437832, + "step": 14480 + }, + { + "epoch": 8.54068396226415, + "grad_norm": 4.417331218719482, + "learning_rate": 7.08132910647373e-06, + "loss": 0.4489, + "num_input_tokens_seen": 9440328, + "step": 14485 + }, + { + "epoch": 8.543632075471699, + "grad_norm": 3.271285057067871, + "learning_rate": 7.078989608856142e-06, + "loss": 0.3863, + "num_input_tokens_seen": 9443304, + "step": 14490 + }, + { + "epoch": 8.546580188679245, + "grad_norm": 0.3511248826980591, + "learning_rate": 7.076649560817092e-06, + "loss": 0.4744, + "num_input_tokens_seen": 9449608, + "step": 14495 + }, + { + "epoch": 8.549528301886792, + "grad_norm": 1.8501567840576172, + "learning_rate": 7.0743089629761245e-06, + "loss": 0.342, + "num_input_tokens_seen": 9452520, + "step": 14500 + }, + { + "epoch": 8.55247641509434, + "grad_norm": 2.4843695163726807, + "learning_rate": 7.071967815952917e-06, + "loss": 0.288, + "num_input_tokens_seen": 9454728, + "step": 14505 + }, + { + "epoch": 8.555424528301886, + "grad_norm": 2.3058955669403076, + "learning_rate": 7.0696261203673e-06, + "loss": 0.3987, + "num_input_tokens_seen": 9457832, + "step": 14510 + }, + { + "epoch": 8.558372641509434, + "grad_norm": 3.246218681335449, + "learning_rate": 7.067283876839249e-06, + "loss": 0.3172, + "num_input_tokens_seen": 9460424, + "step": 14515 + }, + { + "epoch": 8.56132075471698, + "grad_norm": 1.9781414270401, + "learning_rate": 7.064941085988884e-06, + "loss": 0.4047, + "num_input_tokens_seen": 9463848, + "step": 14520 + }, + { + "epoch": 8.564268867924529, + "grad_norm": 1.635388731956482, + "learning_rate": 7.062597748436464e-06, + "loss": 0.3855, + "num_input_tokens_seen": 9467368, + "step": 14525 + }, + { + "epoch": 8.567216981132075, + "grad_norm": 5.490896701812744, + "learning_rate": 7.060253864802402e-06, + "loss": 0.4486, + "num_input_tokens_seen": 9470280, + "step": 14530 + }, + { + "epoch": 8.570165094339622, + "grad_norm": 3.234849452972412, + "learning_rate": 7.05790943570725e-06, + "loss": 0.2998, + "num_input_tokens_seen": 9473128, + "step": 14535 + }, + { + "epoch": 8.57311320754717, + "grad_norm": 2.327120065689087, + "learning_rate": 7.055564461771704e-06, + "loss": 0.3409, + "num_input_tokens_seen": 9477224, + "step": 14540 + }, + { + "epoch": 8.576061320754716, + "grad_norm": 1.818238377571106, + "learning_rate": 7.053218943616611e-06, + "loss": 0.3431, + "num_input_tokens_seen": 9480488, + "step": 14545 + }, + { + "epoch": 8.579009433962264, + "grad_norm": 2.931459903717041, + "learning_rate": 7.050872881862952e-06, + "loss": 0.3548, + "num_input_tokens_seen": 9483752, + "step": 14550 + }, + { + "epoch": 8.58195754716981, + "grad_norm": 2.881089925765991, + "learning_rate": 7.04852627713186e-06, + "loss": 0.3602, + "num_input_tokens_seen": 9487144, + "step": 14555 + }, + { + "epoch": 8.584905660377359, + "grad_norm": 2.698629856109619, + "learning_rate": 7.0461791300446055e-06, + "loss": 0.3691, + "num_input_tokens_seen": 9489960, + "step": 14560 + }, + { + "epoch": 8.587853773584905, + "grad_norm": 4.952970027923584, + "learning_rate": 7.043831441222611e-06, + "loss": 0.5597, + "num_input_tokens_seen": 9494568, + "step": 14565 + }, + { + "epoch": 8.590801886792454, + "grad_norm": 3.6802735328674316, + "learning_rate": 7.04148321128743e-06, + "loss": 0.447, + "num_input_tokens_seen": 9498216, + "step": 14570 + }, + { + "epoch": 8.59375, + "grad_norm": 3.7266273498535156, + "learning_rate": 7.039134440860773e-06, + "loss": 0.4238, + "num_input_tokens_seen": 9502408, + "step": 14575 + }, + { + "epoch": 8.596698113207546, + "grad_norm": 1.7840392589569092, + "learning_rate": 7.036785130564484e-06, + "loss": 0.3865, + "num_input_tokens_seen": 9505320, + "step": 14580 + }, + { + "epoch": 8.599646226415095, + "grad_norm": 1.9658892154693604, + "learning_rate": 7.0344352810205544e-06, + "loss": 0.432, + "num_input_tokens_seen": 9508232, + "step": 14585 + }, + { + "epoch": 8.602594339622641, + "grad_norm": 2.8039045333862305, + "learning_rate": 7.032084892851115e-06, + "loss": 0.3003, + "num_input_tokens_seen": 9511336, + "step": 14590 + }, + { + "epoch": 8.60554245283019, + "grad_norm": 2.4071102142333984, + "learning_rate": 7.0297339666784425e-06, + "loss": 0.4037, + "num_input_tokens_seen": 9514600, + "step": 14595 + }, + { + "epoch": 8.608490566037736, + "grad_norm": 1.9058986902236938, + "learning_rate": 7.0273825031249556e-06, + "loss": 0.3441, + "num_input_tokens_seen": 9517608, + "step": 14600 + }, + { + "epoch": 8.611438679245284, + "grad_norm": 2.071061134338379, + "learning_rate": 7.025030502813213e-06, + "loss": 0.296, + "num_input_tokens_seen": 9521000, + "step": 14605 + }, + { + "epoch": 8.61438679245283, + "grad_norm": 4.542070388793945, + "learning_rate": 7.022677966365917e-06, + "loss": 0.4603, + "num_input_tokens_seen": 9523720, + "step": 14610 + }, + { + "epoch": 8.617334905660378, + "grad_norm": 2.9570059776306152, + "learning_rate": 7.020324894405913e-06, + "loss": 0.3647, + "num_input_tokens_seen": 9527080, + "step": 14615 + }, + { + "epoch": 8.620283018867925, + "grad_norm": 3.77510142326355, + "learning_rate": 7.017971287556185e-06, + "loss": 0.4807, + "num_input_tokens_seen": 9531368, + "step": 14620 + }, + { + "epoch": 8.623231132075471, + "grad_norm": 3.354464530944824, + "learning_rate": 7.015617146439863e-06, + "loss": 0.3272, + "num_input_tokens_seen": 9534376, + "step": 14625 + }, + { + "epoch": 8.62617924528302, + "grad_norm": 1.8330391645431519, + "learning_rate": 7.0132624716802125e-06, + "loss": 0.4818, + "num_input_tokens_seen": 9538504, + "step": 14630 + }, + { + "epoch": 8.629127358490566, + "grad_norm": 1.7349047660827637, + "learning_rate": 7.0109072639006474e-06, + "loss": 0.3811, + "num_input_tokens_seen": 9541864, + "step": 14635 + }, + { + "epoch": 8.632075471698114, + "grad_norm": 5.677450180053711, + "learning_rate": 7.008551523724717e-06, + "loss": 0.4787, + "num_input_tokens_seen": 9544712, + "step": 14640 + }, + { + "epoch": 8.63502358490566, + "grad_norm": 1.7557170391082764, + "learning_rate": 7.006195251776116e-06, + "loss": 0.4003, + "num_input_tokens_seen": 9547016, + "step": 14645 + }, + { + "epoch": 8.637971698113208, + "grad_norm": 1.952061653137207, + "learning_rate": 7.003838448678674e-06, + "loss": 0.3155, + "num_input_tokens_seen": 9549736, + "step": 14650 + }, + { + "epoch": 8.640919811320755, + "grad_norm": 2.693075656890869, + "learning_rate": 7.0014811150563675e-06, + "loss": 0.4608, + "num_input_tokens_seen": 9552936, + "step": 14655 + }, + { + "epoch": 8.643867924528301, + "grad_norm": 3.187349319458008, + "learning_rate": 6.999123251533311e-06, + "loss": 0.4517, + "num_input_tokens_seen": 9557064, + "step": 14660 + }, + { + "epoch": 8.64681603773585, + "grad_norm": 3.3136374950408936, + "learning_rate": 6.996764858733756e-06, + "loss": 0.4811, + "num_input_tokens_seen": 9560968, + "step": 14665 + }, + { + "epoch": 8.649764150943396, + "grad_norm": 2.0213358402252197, + "learning_rate": 6.994405937282099e-06, + "loss": 0.4053, + "num_input_tokens_seen": 9564680, + "step": 14670 + }, + { + "epoch": 8.652712264150944, + "grad_norm": 5.869167327880859, + "learning_rate": 6.9920464878028745e-06, + "loss": 0.4502, + "num_input_tokens_seen": 9567944, + "step": 14675 + }, + { + "epoch": 8.65566037735849, + "grad_norm": 2.5663812160491943, + "learning_rate": 6.989686510920758e-06, + "loss": 0.4961, + "num_input_tokens_seen": 9571272, + "step": 14680 + }, + { + "epoch": 8.658608490566039, + "grad_norm": 3.2090396881103516, + "learning_rate": 6.9873260072605634e-06, + "loss": 0.3187, + "num_input_tokens_seen": 9576104, + "step": 14685 + }, + { + "epoch": 8.661556603773585, + "grad_norm": 5.881671905517578, + "learning_rate": 6.984964977447243e-06, + "loss": 0.4937, + "num_input_tokens_seen": 9578920, + "step": 14690 + }, + { + "epoch": 8.664504716981131, + "grad_norm": 2.5772156715393066, + "learning_rate": 6.982603422105889e-06, + "loss": 0.3695, + "num_input_tokens_seen": 9584136, + "step": 14695 + }, + { + "epoch": 8.66745283018868, + "grad_norm": 4.4444355964660645, + "learning_rate": 6.980241341861736e-06, + "loss": 0.4547, + "num_input_tokens_seen": 9589768, + "step": 14700 + }, + { + "epoch": 8.670400943396226, + "grad_norm": 2.446840286254883, + "learning_rate": 6.977878737340153e-06, + "loss": 0.4855, + "num_input_tokens_seen": 9596744, + "step": 14705 + }, + { + "epoch": 8.673349056603774, + "grad_norm": 2.2228715419769287, + "learning_rate": 6.97551560916665e-06, + "loss": 0.3848, + "num_input_tokens_seen": 9600520, + "step": 14710 + }, + { + "epoch": 8.67629716981132, + "grad_norm": 2.2739105224609375, + "learning_rate": 6.973151957966875e-06, + "loss": 0.5385, + "num_input_tokens_seen": 9603432, + "step": 14715 + }, + { + "epoch": 8.679245283018869, + "grad_norm": 3.178424596786499, + "learning_rate": 6.970787784366616e-06, + "loss": 0.4285, + "num_input_tokens_seen": 9607208, + "step": 14720 + }, + { + "epoch": 8.682193396226415, + "grad_norm": 2.009650468826294, + "learning_rate": 6.968423088991797e-06, + "loss": 0.3787, + "num_input_tokens_seen": 9610088, + "step": 14725 + }, + { + "epoch": 8.685141509433961, + "grad_norm": 2.3175225257873535, + "learning_rate": 6.966057872468481e-06, + "loss": 0.399, + "num_input_tokens_seen": 9613288, + "step": 14730 + }, + { + "epoch": 8.68808962264151, + "grad_norm": 1.885446548461914, + "learning_rate": 6.963692135422872e-06, + "loss": 0.4141, + "num_input_tokens_seen": 9617928, + "step": 14735 + }, + { + "epoch": 8.691037735849056, + "grad_norm": 1.8774847984313965, + "learning_rate": 6.961325878481305e-06, + "loss": 0.3259, + "num_input_tokens_seen": 9620712, + "step": 14740 + }, + { + "epoch": 8.693985849056604, + "grad_norm": 2.97818922996521, + "learning_rate": 6.958959102270259e-06, + "loss": 0.5477, + "num_input_tokens_seen": 9623784, + "step": 14745 + }, + { + "epoch": 8.69693396226415, + "grad_norm": 1.4760496616363525, + "learning_rate": 6.95659180741635e-06, + "loss": 0.3008, + "num_input_tokens_seen": 9626856, + "step": 14750 + }, + { + "epoch": 8.699882075471699, + "grad_norm": 2.454509973526001, + "learning_rate": 6.954223994546326e-06, + "loss": 0.4244, + "num_input_tokens_seen": 9629800, + "step": 14755 + }, + { + "epoch": 8.702830188679245, + "grad_norm": 3.469494581222534, + "learning_rate": 6.951855664287077e-06, + "loss": 0.3438, + "num_input_tokens_seen": 9633384, + "step": 14760 + }, + { + "epoch": 8.705778301886792, + "grad_norm": 1.54270601272583, + "learning_rate": 6.9494868172656304e-06, + "loss": 0.3257, + "num_input_tokens_seen": 9636872, + "step": 14765 + }, + { + "epoch": 8.70872641509434, + "grad_norm": 2.034594774246216, + "learning_rate": 6.947117454109146e-06, + "loss": 0.3777, + "num_input_tokens_seen": 9639720, + "step": 14770 + }, + { + "epoch": 8.711674528301886, + "grad_norm": 2.6863253116607666, + "learning_rate": 6.944747575444924e-06, + "loss": 0.567, + "num_input_tokens_seen": 9642600, + "step": 14775 + }, + { + "epoch": 8.714622641509434, + "grad_norm": 3.1776413917541504, + "learning_rate": 6.942377181900399e-06, + "loss": 0.3753, + "num_input_tokens_seen": 9646568, + "step": 14780 + }, + { + "epoch": 8.71757075471698, + "grad_norm": 3.6220107078552246, + "learning_rate": 6.940006274103146e-06, + "loss": 0.3925, + "num_input_tokens_seen": 9649448, + "step": 14785 + }, + { + "epoch": 8.720518867924529, + "grad_norm": 2.513629913330078, + "learning_rate": 6.93763485268087e-06, + "loss": 0.4324, + "num_input_tokens_seen": 9652008, + "step": 14790 + }, + { + "epoch": 8.723466981132075, + "grad_norm": 2.0231757164001465, + "learning_rate": 6.935262918261416e-06, + "loss": 0.3284, + "num_input_tokens_seen": 9655048, + "step": 14795 + }, + { + "epoch": 8.726415094339622, + "grad_norm": 2.069519281387329, + "learning_rate": 6.932890471472764e-06, + "loss": 0.5942, + "num_input_tokens_seen": 9657960, + "step": 14800 + }, + { + "epoch": 8.72936320754717, + "grad_norm": 2.7226979732513428, + "learning_rate": 6.930517512943029e-06, + "loss": 0.4625, + "num_input_tokens_seen": 9661224, + "step": 14805 + }, + { + "epoch": 8.732311320754716, + "grad_norm": 2.512676239013672, + "learning_rate": 6.928144043300463e-06, + "loss": 0.2971, + "num_input_tokens_seen": 9664392, + "step": 14810 + }, + { + "epoch": 8.735259433962264, + "grad_norm": 2.156874418258667, + "learning_rate": 6.925770063173451e-06, + "loss": 0.5073, + "num_input_tokens_seen": 9668904, + "step": 14815 + }, + { + "epoch": 8.73820754716981, + "grad_norm": 2.5475564002990723, + "learning_rate": 6.923395573190514e-06, + "loss": 0.3598, + "num_input_tokens_seen": 9672424, + "step": 14820 + }, + { + "epoch": 8.741155660377359, + "grad_norm": 2.8670053482055664, + "learning_rate": 6.921020573980313e-06, + "loss": 0.3798, + "num_input_tokens_seen": 9675624, + "step": 14825 + }, + { + "epoch": 8.744103773584905, + "grad_norm": 2.135509729385376, + "learning_rate": 6.918645066171634e-06, + "loss": 0.3263, + "num_input_tokens_seen": 9678920, + "step": 14830 + }, + { + "epoch": 8.747051886792454, + "grad_norm": 4.993575572967529, + "learning_rate": 6.916269050393404e-06, + "loss": 0.3873, + "num_input_tokens_seen": 9681448, + "step": 14835 + }, + { + "epoch": 8.75, + "grad_norm": 2.099391222000122, + "learning_rate": 6.913892527274686e-06, + "loss": 0.5164, + "num_input_tokens_seen": 9684392, + "step": 14840 + }, + { + "epoch": 8.752948113207546, + "grad_norm": 6.596562385559082, + "learning_rate": 6.9115154974446716e-06, + "loss": 0.4245, + "num_input_tokens_seen": 9687720, + "step": 14845 + }, + { + "epoch": 8.755896226415095, + "grad_norm": 2.492990016937256, + "learning_rate": 6.909137961532692e-06, + "loss": 0.2978, + "num_input_tokens_seen": 9690632, + "step": 14850 + }, + { + "epoch": 8.758844339622641, + "grad_norm": 4.106266498565674, + "learning_rate": 6.906759920168209e-06, + "loss": 0.4541, + "num_input_tokens_seen": 9693544, + "step": 14855 + }, + { + "epoch": 8.76179245283019, + "grad_norm": 1.2404447793960571, + "learning_rate": 6.90438137398082e-06, + "loss": 0.382, + "num_input_tokens_seen": 9697992, + "step": 14860 + }, + { + "epoch": 8.764740566037736, + "grad_norm": 1.9230403900146484, + "learning_rate": 6.902002323600252e-06, + "loss": 0.3407, + "num_input_tokens_seen": 9700744, + "step": 14865 + }, + { + "epoch": 8.767688679245284, + "grad_norm": 5.1434526443481445, + "learning_rate": 6.899622769656373e-06, + "loss": 0.3362, + "num_input_tokens_seen": 9703752, + "step": 14870 + }, + { + "epoch": 8.77063679245283, + "grad_norm": 2.4574313163757324, + "learning_rate": 6.897242712779179e-06, + "loss": 0.3287, + "num_input_tokens_seen": 9707272, + "step": 14875 + }, + { + "epoch": 8.773584905660378, + "grad_norm": 1.8372465372085571, + "learning_rate": 6.894862153598802e-06, + "loss": 0.3483, + "num_input_tokens_seen": 9710280, + "step": 14880 + }, + { + "epoch": 8.776533018867925, + "grad_norm": 2.3414535522460938, + "learning_rate": 6.892481092745502e-06, + "loss": 0.4507, + "num_input_tokens_seen": 9714216, + "step": 14885 + }, + { + "epoch": 8.779481132075471, + "grad_norm": 2.5674240589141846, + "learning_rate": 6.890099530849677e-06, + "loss": 0.6275, + "num_input_tokens_seen": 9717224, + "step": 14890 + }, + { + "epoch": 8.78242924528302, + "grad_norm": 1.6274032592773438, + "learning_rate": 6.887717468541855e-06, + "loss": 0.4574, + "num_input_tokens_seen": 9720872, + "step": 14895 + }, + { + "epoch": 8.785377358490566, + "grad_norm": 3.696709394454956, + "learning_rate": 6.885334906452696e-06, + "loss": 0.4132, + "num_input_tokens_seen": 9723624, + "step": 14900 + }, + { + "epoch": 8.788325471698114, + "grad_norm": 4.480981826782227, + "learning_rate": 6.882951845212997e-06, + "loss": 0.3908, + "num_input_tokens_seen": 9726248, + "step": 14905 + }, + { + "epoch": 8.79127358490566, + "grad_norm": 2.964874744415283, + "learning_rate": 6.880568285453682e-06, + "loss": 0.4575, + "num_input_tokens_seen": 9728456, + "step": 14910 + }, + { + "epoch": 8.794221698113208, + "grad_norm": 2.3584275245666504, + "learning_rate": 6.878184227805807e-06, + "loss": 0.3126, + "num_input_tokens_seen": 9732360, + "step": 14915 + }, + { + "epoch": 8.797169811320755, + "grad_norm": 1.721100926399231, + "learning_rate": 6.8757996729005645e-06, + "loss": 0.3775, + "num_input_tokens_seen": 9736072, + "step": 14920 + }, + { + "epoch": 8.800117924528301, + "grad_norm": 2.1002752780914307, + "learning_rate": 6.8734146213692756e-06, + "loss": 0.356, + "num_input_tokens_seen": 9738856, + "step": 14925 + }, + { + "epoch": 8.80306603773585, + "grad_norm": 2.216747283935547, + "learning_rate": 6.87102907384339e-06, + "loss": 0.3749, + "num_input_tokens_seen": 9741992, + "step": 14930 + }, + { + "epoch": 8.806014150943396, + "grad_norm": 2.1929080486297607, + "learning_rate": 6.868643030954494e-06, + "loss": 0.4098, + "num_input_tokens_seen": 9745896, + "step": 14935 + }, + { + "epoch": 8.808962264150944, + "grad_norm": 1.8580706119537354, + "learning_rate": 6.866256493334302e-06, + "loss": 0.4265, + "num_input_tokens_seen": 9749576, + "step": 14940 + }, + { + "epoch": 8.81191037735849, + "grad_norm": 2.153745174407959, + "learning_rate": 6.863869461614659e-06, + "loss": 0.3347, + "num_input_tokens_seen": 9753672, + "step": 14945 + }, + { + "epoch": 8.814858490566039, + "grad_norm": 2.6148741245269775, + "learning_rate": 6.861481936427545e-06, + "loss": 0.3544, + "num_input_tokens_seen": 9757320, + "step": 14950 + }, + { + "epoch": 8.817806603773585, + "grad_norm": 4.132111549377441, + "learning_rate": 6.859093918405067e-06, + "loss": 0.3839, + "num_input_tokens_seen": 9760328, + "step": 14955 + }, + { + "epoch": 8.820754716981131, + "grad_norm": 4.385087490081787, + "learning_rate": 6.856705408179458e-06, + "loss": 0.5049, + "num_input_tokens_seen": 9763496, + "step": 14960 + }, + { + "epoch": 8.82370283018868, + "grad_norm": 1.8727986812591553, + "learning_rate": 6.854316406383093e-06, + "loss": 0.419, + "num_input_tokens_seen": 9767592, + "step": 14965 + }, + { + "epoch": 8.826650943396226, + "grad_norm": 2.9489119052886963, + "learning_rate": 6.8519269136484665e-06, + "loss": 0.4227, + "num_input_tokens_seen": 9770440, + "step": 14970 + }, + { + "epoch": 8.829599056603774, + "grad_norm": 3.0117952823638916, + "learning_rate": 6.849536930608208e-06, + "loss": 0.389, + "num_input_tokens_seen": 9772552, + "step": 14975 + }, + { + "epoch": 8.83254716981132, + "grad_norm": 1.9244282245635986, + "learning_rate": 6.847146457895078e-06, + "loss": 0.3432, + "num_input_tokens_seen": 9775752, + "step": 14980 + }, + { + "epoch": 8.835495283018869, + "grad_norm": 1.5959678888320923, + "learning_rate": 6.8447554961419615e-06, + "loss": 0.3584, + "num_input_tokens_seen": 9778760, + "step": 14985 + }, + { + "epoch": 8.838443396226415, + "grad_norm": 1.6212234497070312, + "learning_rate": 6.842364045981876e-06, + "loss": 0.313, + "num_input_tokens_seen": 9782280, + "step": 14990 + }, + { + "epoch": 8.841391509433961, + "grad_norm": 2.307650566101074, + "learning_rate": 6.83997210804797e-06, + "loss": 0.3883, + "num_input_tokens_seen": 9786440, + "step": 14995 + }, + { + "epoch": 8.84433962264151, + "grad_norm": 1.8725664615631104, + "learning_rate": 6.837579682973519e-06, + "loss": 0.4152, + "num_input_tokens_seen": 9789832, + "step": 15000 + }, + { + "epoch": 8.847287735849056, + "grad_norm": 2.428133249282837, + "learning_rate": 6.835186771391926e-06, + "loss": 0.2716, + "num_input_tokens_seen": 9792360, + "step": 15005 + }, + { + "epoch": 8.850235849056604, + "grad_norm": 3.086512327194214, + "learning_rate": 6.8327933739367266e-06, + "loss": 0.4118, + "num_input_tokens_seen": 9794824, + "step": 15010 + }, + { + "epoch": 8.85318396226415, + "grad_norm": 3.4672436714172363, + "learning_rate": 6.830399491241584e-06, + "loss": 0.2922, + "num_input_tokens_seen": 9798280, + "step": 15015 + }, + { + "epoch": 8.856132075471699, + "grad_norm": 1.7747465372085571, + "learning_rate": 6.828005123940287e-06, + "loss": 0.3109, + "num_input_tokens_seen": 9803720, + "step": 15020 + }, + { + "epoch": 8.859080188679245, + "grad_norm": 3.624042510986328, + "learning_rate": 6.825610272666754e-06, + "loss": 0.4407, + "num_input_tokens_seen": 9806504, + "step": 15025 + }, + { + "epoch": 8.862028301886792, + "grad_norm": 3.3068394660949707, + "learning_rate": 6.823214938055034e-06, + "loss": 0.4007, + "num_input_tokens_seen": 9809192, + "step": 15030 + }, + { + "epoch": 8.86497641509434, + "grad_norm": 2.7866663932800293, + "learning_rate": 6.8208191207393e-06, + "loss": 0.48, + "num_input_tokens_seen": 9812008, + "step": 15035 + }, + { + "epoch": 8.867924528301886, + "grad_norm": 1.5656015872955322, + "learning_rate": 6.818422821353859e-06, + "loss": 0.3435, + "num_input_tokens_seen": 9815464, + "step": 15040 + }, + { + "epoch": 8.870872641509434, + "grad_norm": 4.182625770568848, + "learning_rate": 6.816026040533139e-06, + "loss": 0.4625, + "num_input_tokens_seen": 9818888, + "step": 15045 + }, + { + "epoch": 8.87382075471698, + "grad_norm": 2.716383695602417, + "learning_rate": 6.8136287789116966e-06, + "loss": 0.3678, + "num_input_tokens_seen": 9822952, + "step": 15050 + }, + { + "epoch": 8.876768867924529, + "grad_norm": 1.5944288969039917, + "learning_rate": 6.81123103712422e-06, + "loss": 0.4056, + "num_input_tokens_seen": 9827144, + "step": 15055 + }, + { + "epoch": 8.879716981132075, + "grad_norm": 2.8928966522216797, + "learning_rate": 6.808832815805518e-06, + "loss": 0.3675, + "num_input_tokens_seen": 9830440, + "step": 15060 + }, + { + "epoch": 8.882665094339622, + "grad_norm": 2.9204976558685303, + "learning_rate": 6.806434115590534e-06, + "loss": 0.443, + "num_input_tokens_seen": 9833192, + "step": 15065 + }, + { + "epoch": 8.88561320754717, + "grad_norm": 2.4361791610717773, + "learning_rate": 6.804034937114332e-06, + "loss": 0.3941, + "num_input_tokens_seen": 9835880, + "step": 15070 + }, + { + "epoch": 8.888561320754716, + "grad_norm": 2.6314823627471924, + "learning_rate": 6.8016352810121064e-06, + "loss": 0.3446, + "num_input_tokens_seen": 9838248, + "step": 15075 + }, + { + "epoch": 8.891509433962264, + "grad_norm": 2.179109811782837, + "learning_rate": 6.799235147919176e-06, + "loss": 0.4545, + "num_input_tokens_seen": 9841480, + "step": 15080 + }, + { + "epoch": 8.89445754716981, + "grad_norm": 2.0048701763153076, + "learning_rate": 6.796834538470985e-06, + "loss": 0.4262, + "num_input_tokens_seen": 9844392, + "step": 15085 + }, + { + "epoch": 8.897405660377359, + "grad_norm": 1.8004496097564697, + "learning_rate": 6.794433453303106e-06, + "loss": 0.4196, + "num_input_tokens_seen": 9847848, + "step": 15090 + }, + { + "epoch": 8.900353773584905, + "grad_norm": 2.1938366889953613, + "learning_rate": 6.792031893051238e-06, + "loss": 0.4999, + "num_input_tokens_seen": 9851432, + "step": 15095 + }, + { + "epoch": 8.903301886792454, + "grad_norm": 3.094050168991089, + "learning_rate": 6.789629858351201e-06, + "loss": 0.3777, + "num_input_tokens_seen": 9854984, + "step": 15100 + }, + { + "epoch": 8.90625, + "grad_norm": 2.8745994567871094, + "learning_rate": 6.787227349838946e-06, + "loss": 0.4076, + "num_input_tokens_seen": 9857800, + "step": 15105 + }, + { + "epoch": 8.909198113207546, + "grad_norm": 2.2255163192749023, + "learning_rate": 6.784824368150548e-06, + "loss": 0.3255, + "num_input_tokens_seen": 9860296, + "step": 15110 + }, + { + "epoch": 8.912146226415095, + "grad_norm": 2.0890579223632812, + "learning_rate": 6.7824209139222076e-06, + "loss": 0.3604, + "num_input_tokens_seen": 9862984, + "step": 15115 + }, + { + "epoch": 8.915094339622641, + "grad_norm": 2.5852580070495605, + "learning_rate": 6.780016987790248e-06, + "loss": 0.3428, + "num_input_tokens_seen": 9866120, + "step": 15120 + }, + { + "epoch": 8.91804245283019, + "grad_norm": 2.4474847316741943, + "learning_rate": 6.7776125903911194e-06, + "loss": 0.4906, + "num_input_tokens_seen": 9870120, + "step": 15125 + }, + { + "epoch": 8.920990566037736, + "grad_norm": 3.444129705429077, + "learning_rate": 6.775207722361396e-06, + "loss": 0.5517, + "num_input_tokens_seen": 9872808, + "step": 15130 + }, + { + "epoch": 8.923938679245284, + "grad_norm": 2.122300624847412, + "learning_rate": 6.772802384337778e-06, + "loss": 0.4352, + "num_input_tokens_seen": 9876456, + "step": 15135 + }, + { + "epoch": 8.92688679245283, + "grad_norm": 3.071918249130249, + "learning_rate": 6.770396576957088e-06, + "loss": 0.2946, + "num_input_tokens_seen": 9878728, + "step": 15140 + }, + { + "epoch": 8.929834905660378, + "grad_norm": 1.8663431406021118, + "learning_rate": 6.767990300856274e-06, + "loss": 0.5341, + "num_input_tokens_seen": 9881704, + "step": 15145 + }, + { + "epoch": 8.932783018867925, + "grad_norm": 2.977050304412842, + "learning_rate": 6.765583556672408e-06, + "loss": 0.3521, + "num_input_tokens_seen": 9884392, + "step": 15150 + }, + { + "epoch": 8.935731132075471, + "grad_norm": 1.5496002435684204, + "learning_rate": 6.763176345042687e-06, + "loss": 0.2488, + "num_input_tokens_seen": 9887528, + "step": 15155 + }, + { + "epoch": 8.93867924528302, + "grad_norm": 5.135709762573242, + "learning_rate": 6.760768666604429e-06, + "loss": 0.6035, + "num_input_tokens_seen": 9890376, + "step": 15160 + }, + { + "epoch": 8.941627358490566, + "grad_norm": 4.098505973815918, + "learning_rate": 6.758360521995079e-06, + "loss": 0.4492, + "num_input_tokens_seen": 9893320, + "step": 15165 + }, + { + "epoch": 8.944575471698114, + "grad_norm": 2.8324337005615234, + "learning_rate": 6.755951911852202e-06, + "loss": 0.3756, + "num_input_tokens_seen": 9896232, + "step": 15170 + }, + { + "epoch": 8.94752358490566, + "grad_norm": 4.494495391845703, + "learning_rate": 6.7535428368134885e-06, + "loss": 0.4015, + "num_input_tokens_seen": 9899464, + "step": 15175 + }, + { + "epoch": 8.950471698113208, + "grad_norm": 2.36110520362854, + "learning_rate": 6.751133297516752e-06, + "loss": 0.5217, + "num_input_tokens_seen": 9902728, + "step": 15180 + }, + { + "epoch": 8.953419811320755, + "grad_norm": 2.2041213512420654, + "learning_rate": 6.748723294599928e-06, + "loss": 0.35, + "num_input_tokens_seen": 9906280, + "step": 15185 + }, + { + "epoch": 8.956367924528301, + "grad_norm": 1.1619752645492554, + "learning_rate": 6.746312828701075e-06, + "loss": 0.4208, + "num_input_tokens_seen": 9909960, + "step": 15190 + }, + { + "epoch": 8.95931603773585, + "grad_norm": 4.294003009796143, + "learning_rate": 6.743901900458374e-06, + "loss": 0.454, + "num_input_tokens_seen": 9912744, + "step": 15195 + }, + { + "epoch": 8.962264150943396, + "grad_norm": 2.167071580886841, + "learning_rate": 6.741490510510129e-06, + "loss": 0.3991, + "num_input_tokens_seen": 9915976, + "step": 15200 + }, + { + "epoch": 8.965212264150944, + "grad_norm": 2.4379770755767822, + "learning_rate": 6.7390786594947665e-06, + "loss": 0.2997, + "num_input_tokens_seen": 9918536, + "step": 15205 + }, + { + "epoch": 8.96816037735849, + "grad_norm": 2.330822229385376, + "learning_rate": 6.7366663480508335e-06, + "loss": 0.453, + "num_input_tokens_seen": 9921416, + "step": 15210 + }, + { + "epoch": 8.971108490566039, + "grad_norm": 1.5783472061157227, + "learning_rate": 6.734253576817002e-06, + "loss": 0.3343, + "num_input_tokens_seen": 9925544, + "step": 15215 + }, + { + "epoch": 8.974056603773585, + "grad_norm": 3.272941827774048, + "learning_rate": 6.731840346432061e-06, + "loss": 0.3874, + "num_input_tokens_seen": 9928936, + "step": 15220 + }, + { + "epoch": 8.977004716981131, + "grad_norm": 2.0508549213409424, + "learning_rate": 6.729426657534922e-06, + "loss": 0.4687, + "num_input_tokens_seen": 9931912, + "step": 15225 + }, + { + "epoch": 8.97995283018868, + "grad_norm": 3.4232406616210938, + "learning_rate": 6.727012510764624e-06, + "loss": 0.4709, + "num_input_tokens_seen": 9934984, + "step": 15230 + }, + { + "epoch": 8.982900943396226, + "grad_norm": 1.9945486783981323, + "learning_rate": 6.724597906760322e-06, + "loss": 0.4471, + "num_input_tokens_seen": 9937800, + "step": 15235 + }, + { + "epoch": 8.985849056603774, + "grad_norm": 2.603271007537842, + "learning_rate": 6.722182846161289e-06, + "loss": 0.5142, + "num_input_tokens_seen": 9940712, + "step": 15240 + }, + { + "epoch": 8.98879716981132, + "grad_norm": 1.8966410160064697, + "learning_rate": 6.719767329606926e-06, + "loss": 0.4572, + "num_input_tokens_seen": 9944584, + "step": 15245 + }, + { + "epoch": 8.991745283018869, + "grad_norm": 3.0709445476531982, + "learning_rate": 6.717351357736751e-06, + "loss": 0.5779, + "num_input_tokens_seen": 9947368, + "step": 15250 + }, + { + "epoch": 8.994693396226415, + "grad_norm": 1.5476329326629639, + "learning_rate": 6.7149349311904025e-06, + "loss": 0.3254, + "num_input_tokens_seen": 9950088, + "step": 15255 + }, + { + "epoch": 8.997641509433961, + "grad_norm": 3.120551347732544, + "learning_rate": 6.712518050607642e-06, + "loss": 0.3661, + "num_input_tokens_seen": 9953096, + "step": 15260 + }, + { + "epoch": 9.00058962264151, + "grad_norm": 4.0768208503723145, + "learning_rate": 6.710100716628345e-06, + "loss": 0.3108, + "num_input_tokens_seen": 9954992, + "step": 15265 + }, + { + "epoch": 9.003537735849056, + "grad_norm": 2.141944169998169, + "learning_rate": 6.707682929892513e-06, + "loss": 0.4337, + "num_input_tokens_seen": 9957808, + "step": 15270 + }, + { + "epoch": 9.006485849056604, + "grad_norm": 3.5433034896850586, + "learning_rate": 6.705264691040266e-06, + "loss": 0.3454, + "num_input_tokens_seen": 9961520, + "step": 15275 + }, + { + "epoch": 9.00943396226415, + "grad_norm": 3.2581379413604736, + "learning_rate": 6.7028460007118435e-06, + "loss": 0.3543, + "num_input_tokens_seen": 9964144, + "step": 15280 + }, + { + "epoch": 9.012382075471699, + "grad_norm": 2.69348406791687, + "learning_rate": 6.700426859547602e-06, + "loss": 0.2855, + "num_input_tokens_seen": 9967056, + "step": 15285 + }, + { + "epoch": 9.015330188679245, + "grad_norm": 1.4692714214324951, + "learning_rate": 6.6980072681880224e-06, + "loss": 0.3572, + "num_input_tokens_seen": 9970800, + "step": 15290 + }, + { + "epoch": 9.018278301886792, + "grad_norm": 3.3459842205047607, + "learning_rate": 6.695587227273699e-06, + "loss": 0.2975, + "num_input_tokens_seen": 9973392, + "step": 15295 + }, + { + "epoch": 9.02122641509434, + "grad_norm": 2.84409236907959, + "learning_rate": 6.69316673744535e-06, + "loss": 0.3454, + "num_input_tokens_seen": 9976176, + "step": 15300 + }, + { + "epoch": 9.024174528301886, + "grad_norm": 3.748802900314331, + "learning_rate": 6.6907457993438115e-06, + "loss": 0.4048, + "num_input_tokens_seen": 9979440, + "step": 15305 + }, + { + "epoch": 9.027122641509434, + "grad_norm": 3.250750780105591, + "learning_rate": 6.688324413610036e-06, + "loss": 0.4487, + "num_input_tokens_seen": 9982192, + "step": 15310 + }, + { + "epoch": 9.03007075471698, + "grad_norm": 2.582509756088257, + "learning_rate": 6.685902580885094e-06, + "loss": 0.2786, + "num_input_tokens_seen": 9984848, + "step": 15315 + }, + { + "epoch": 9.033018867924529, + "grad_norm": 4.707276821136475, + "learning_rate": 6.6834803018101794e-06, + "loss": 0.3064, + "num_input_tokens_seen": 9987568, + "step": 15320 + }, + { + "epoch": 9.035966981132075, + "grad_norm": 2.529352903366089, + "learning_rate": 6.681057577026599e-06, + "loss": 0.3202, + "num_input_tokens_seen": 9990736, + "step": 15325 + }, + { + "epoch": 9.038915094339623, + "grad_norm": 2.9558496475219727, + "learning_rate": 6.6786344071757795e-06, + "loss": 0.4524, + "num_input_tokens_seen": 9994000, + "step": 15330 + }, + { + "epoch": 9.04186320754717, + "grad_norm": 3.077704429626465, + "learning_rate": 6.676210792899267e-06, + "loss": 0.3168, + "num_input_tokens_seen": 9996272, + "step": 15335 + }, + { + "epoch": 9.044811320754716, + "grad_norm": 3.257589101791382, + "learning_rate": 6.6737867348387235e-06, + "loss": 0.3955, + "num_input_tokens_seen": 9999632, + "step": 15340 + }, + { + "epoch": 9.047759433962264, + "grad_norm": 2.5129952430725098, + "learning_rate": 6.671362233635926e-06, + "loss": 0.3593, + "num_input_tokens_seen": 10002512, + "step": 15345 + }, + { + "epoch": 9.05070754716981, + "grad_norm": 2.338437795639038, + "learning_rate": 6.668937289932775e-06, + "loss": 0.3448, + "num_input_tokens_seen": 10005136, + "step": 15350 + }, + { + "epoch": 9.053655660377359, + "grad_norm": 3.7598979473114014, + "learning_rate": 6.666511904371285e-06, + "loss": 0.2999, + "num_input_tokens_seen": 10008336, + "step": 15355 + }, + { + "epoch": 9.056603773584905, + "grad_norm": 3.086458206176758, + "learning_rate": 6.664086077593586e-06, + "loss": 0.2692, + "num_input_tokens_seen": 10011248, + "step": 15360 + }, + { + "epoch": 9.059551886792454, + "grad_norm": 4.133688926696777, + "learning_rate": 6.661659810241924e-06, + "loss": 0.5523, + "num_input_tokens_seen": 10014448, + "step": 15365 + }, + { + "epoch": 9.0625, + "grad_norm": 2.739734411239624, + "learning_rate": 6.659233102958667e-06, + "loss": 0.3496, + "num_input_tokens_seen": 10017616, + "step": 15370 + }, + { + "epoch": 9.065448113207546, + "grad_norm": 2.8032898902893066, + "learning_rate": 6.6568059563862965e-06, + "loss": 0.3469, + "num_input_tokens_seen": 10020432, + "step": 15375 + }, + { + "epoch": 9.068396226415095, + "grad_norm": 2.7908413410186768, + "learning_rate": 6.654378371167407e-06, + "loss": 0.385, + "num_input_tokens_seen": 10024624, + "step": 15380 + }, + { + "epoch": 9.071344339622641, + "grad_norm": 1.3122307062149048, + "learning_rate": 6.651950347944715e-06, + "loss": 0.3473, + "num_input_tokens_seen": 10029040, + "step": 15385 + }, + { + "epoch": 9.07429245283019, + "grad_norm": 3.0658631324768066, + "learning_rate": 6.649521887361048e-06, + "loss": 0.3784, + "num_input_tokens_seen": 10034064, + "step": 15390 + }, + { + "epoch": 9.077240566037736, + "grad_norm": 2.026421070098877, + "learning_rate": 6.647092990059352e-06, + "loss": 0.4726, + "num_input_tokens_seen": 10037200, + "step": 15395 + }, + { + "epoch": 9.080188679245284, + "grad_norm": 1.9497674703598022, + "learning_rate": 6.644663656682689e-06, + "loss": 0.4033, + "num_input_tokens_seen": 10040368, + "step": 15400 + }, + { + "epoch": 9.08313679245283, + "grad_norm": 2.7108278274536133, + "learning_rate": 6.642233887874234e-06, + "loss": 0.3821, + "num_input_tokens_seen": 10044208, + "step": 15405 + }, + { + "epoch": 9.086084905660377, + "grad_norm": 5.38900899887085, + "learning_rate": 6.639803684277279e-06, + "loss": 0.4293, + "num_input_tokens_seen": 10047120, + "step": 15410 + }, + { + "epoch": 9.089033018867925, + "grad_norm": 4.527899265289307, + "learning_rate": 6.637373046535233e-06, + "loss": 0.3601, + "num_input_tokens_seen": 10050288, + "step": 15415 + }, + { + "epoch": 9.091981132075471, + "grad_norm": 4.853857517242432, + "learning_rate": 6.634941975291617e-06, + "loss": 0.4904, + "num_input_tokens_seen": 10054480, + "step": 15420 + }, + { + "epoch": 9.09492924528302, + "grad_norm": 2.3710391521453857, + "learning_rate": 6.632510471190065e-06, + "loss": 0.3969, + "num_input_tokens_seen": 10057616, + "step": 15425 + }, + { + "epoch": 9.097877358490566, + "grad_norm": 1.9033665657043457, + "learning_rate": 6.630078534874332e-06, + "loss": 0.3054, + "num_input_tokens_seen": 10060144, + "step": 15430 + }, + { + "epoch": 9.100825471698114, + "grad_norm": 2.2946364879608154, + "learning_rate": 6.62764616698828e-06, + "loss": 0.433, + "num_input_tokens_seen": 10062928, + "step": 15435 + }, + { + "epoch": 9.10377358490566, + "grad_norm": 2.050767660140991, + "learning_rate": 6.625213368175889e-06, + "loss": 0.3988, + "num_input_tokens_seen": 10065840, + "step": 15440 + }, + { + "epoch": 9.106721698113208, + "grad_norm": 2.644279956817627, + "learning_rate": 6.622780139081256e-06, + "loss": 0.3401, + "num_input_tokens_seen": 10068912, + "step": 15445 + }, + { + "epoch": 9.109669811320755, + "grad_norm": 2.5647025108337402, + "learning_rate": 6.620346480348589e-06, + "loss": 0.4874, + "num_input_tokens_seen": 10071728, + "step": 15450 + }, + { + "epoch": 9.112617924528301, + "grad_norm": 3.909867525100708, + "learning_rate": 6.617912392622206e-06, + "loss": 0.3906, + "num_input_tokens_seen": 10074672, + "step": 15455 + }, + { + "epoch": 9.11556603773585, + "grad_norm": 1.249127984046936, + "learning_rate": 6.615477876546544e-06, + "loss": 0.2949, + "num_input_tokens_seen": 10078160, + "step": 15460 + }, + { + "epoch": 9.118514150943396, + "grad_norm": 4.260204315185547, + "learning_rate": 6.6130429327661535e-06, + "loss": 0.3605, + "num_input_tokens_seen": 10080912, + "step": 15465 + }, + { + "epoch": 9.121462264150944, + "grad_norm": 1.778310775756836, + "learning_rate": 6.610607561925694e-06, + "loss": 0.3987, + "num_input_tokens_seen": 10084048, + "step": 15470 + }, + { + "epoch": 9.12441037735849, + "grad_norm": 1.6034519672393799, + "learning_rate": 6.608171764669941e-06, + "loss": 0.36, + "num_input_tokens_seen": 10087312, + "step": 15475 + }, + { + "epoch": 9.127358490566039, + "grad_norm": 3.438253402709961, + "learning_rate": 6.605735541643783e-06, + "loss": 0.4232, + "num_input_tokens_seen": 10091184, + "step": 15480 + }, + { + "epoch": 9.130306603773585, + "grad_norm": 3.0842971801757812, + "learning_rate": 6.603298893492219e-06, + "loss": 0.3992, + "num_input_tokens_seen": 10096400, + "step": 15485 + }, + { + "epoch": 9.133254716981131, + "grad_norm": 3.1788229942321777, + "learning_rate": 6.6008618208603625e-06, + "loss": 0.38, + "num_input_tokens_seen": 10099216, + "step": 15490 + }, + { + "epoch": 9.13620283018868, + "grad_norm": 2.855682373046875, + "learning_rate": 6.598424324393442e-06, + "loss": 0.3689, + "num_input_tokens_seen": 10102224, + "step": 15495 + }, + { + "epoch": 9.139150943396226, + "grad_norm": 2.0665063858032227, + "learning_rate": 6.595986404736792e-06, + "loss": 0.4944, + "num_input_tokens_seen": 10105904, + "step": 15500 + }, + { + "epoch": 9.142099056603774, + "grad_norm": 3.4416561126708984, + "learning_rate": 6.5935480625358615e-06, + "loss": 0.3883, + "num_input_tokens_seen": 10108976, + "step": 15505 + }, + { + "epoch": 9.14504716981132, + "grad_norm": 3.0720560550689697, + "learning_rate": 6.591109298436216e-06, + "loss": 0.3664, + "num_input_tokens_seen": 10112048, + "step": 15510 + }, + { + "epoch": 9.147995283018869, + "grad_norm": 2.3089096546173096, + "learning_rate": 6.5886701130835255e-06, + "loss": 0.336, + "num_input_tokens_seen": 10115344, + "step": 15515 + }, + { + "epoch": 9.150943396226415, + "grad_norm": 2.373603105545044, + "learning_rate": 6.586230507123574e-06, + "loss": 0.2463, + "num_input_tokens_seen": 10118288, + "step": 15520 + }, + { + "epoch": 9.153891509433961, + "grad_norm": 4.4341301918029785, + "learning_rate": 6.583790481202261e-06, + "loss": 0.4199, + "num_input_tokens_seen": 10122000, + "step": 15525 + }, + { + "epoch": 9.15683962264151, + "grad_norm": 2.1818697452545166, + "learning_rate": 6.5813500359655925e-06, + "loss": 0.4549, + "num_input_tokens_seen": 10124976, + "step": 15530 + }, + { + "epoch": 9.159787735849056, + "grad_norm": 2.5069644451141357, + "learning_rate": 6.578909172059687e-06, + "loss": 0.3342, + "num_input_tokens_seen": 10129424, + "step": 15535 + }, + { + "epoch": 9.162735849056604, + "grad_norm": 2.1268208026885986, + "learning_rate": 6.576467890130772e-06, + "loss": 0.5142, + "num_input_tokens_seen": 10132336, + "step": 15540 + }, + { + "epoch": 9.16568396226415, + "grad_norm": 1.7799327373504639, + "learning_rate": 6.574026190825191e-06, + "loss": 0.3993, + "num_input_tokens_seen": 10135824, + "step": 15545 + }, + { + "epoch": 9.168632075471699, + "grad_norm": 2.2952194213867188, + "learning_rate": 6.57158407478939e-06, + "loss": 0.584, + "num_input_tokens_seen": 10139120, + "step": 15550 + }, + { + "epoch": 9.171580188679245, + "grad_norm": 2.8885700702667236, + "learning_rate": 6.569141542669935e-06, + "loss": 0.3488, + "num_input_tokens_seen": 10141584, + "step": 15555 + }, + { + "epoch": 9.174528301886792, + "grad_norm": 3.753981351852417, + "learning_rate": 6.566698595113492e-06, + "loss": 0.4841, + "num_input_tokens_seen": 10144048, + "step": 15560 + }, + { + "epoch": 9.17747641509434, + "grad_norm": 1.7688887119293213, + "learning_rate": 6.564255232766843e-06, + "loss": 0.4002, + "num_input_tokens_seen": 10147568, + "step": 15565 + }, + { + "epoch": 9.180424528301886, + "grad_norm": 1.7112520933151245, + "learning_rate": 6.561811456276881e-06, + "loss": 0.3668, + "num_input_tokens_seen": 10151088, + "step": 15570 + }, + { + "epoch": 9.183372641509434, + "grad_norm": 2.597309112548828, + "learning_rate": 6.559367266290605e-06, + "loss": 0.4924, + "num_input_tokens_seen": 10154352, + "step": 15575 + }, + { + "epoch": 9.18632075471698, + "grad_norm": 1.7510815858840942, + "learning_rate": 6.556922663455123e-06, + "loss": 0.4176, + "num_input_tokens_seen": 10157776, + "step": 15580 + }, + { + "epoch": 9.189268867924529, + "grad_norm": 3.1420514583587646, + "learning_rate": 6.554477648417657e-06, + "loss": 0.2943, + "num_input_tokens_seen": 10160528, + "step": 15585 + }, + { + "epoch": 9.192216981132075, + "grad_norm": 1.717568039894104, + "learning_rate": 6.552032221825535e-06, + "loss": 0.439, + "num_input_tokens_seen": 10164080, + "step": 15590 + }, + { + "epoch": 9.195165094339623, + "grad_norm": 1.507290005683899, + "learning_rate": 6.549586384326192e-06, + "loss": 0.3956, + "num_input_tokens_seen": 10167600, + "step": 15595 + }, + { + "epoch": 9.19811320754717, + "grad_norm": 4.158390045166016, + "learning_rate": 6.547140136567176e-06, + "loss": 0.3982, + "num_input_tokens_seen": 10170512, + "step": 15600 + }, + { + "epoch": 9.201061320754716, + "grad_norm": 1.8670555353164673, + "learning_rate": 6.544693479196142e-06, + "loss": 0.3391, + "num_input_tokens_seen": 10174928, + "step": 15605 + }, + { + "epoch": 9.204009433962264, + "grad_norm": 2.9960825443267822, + "learning_rate": 6.542246412860851e-06, + "loss": 0.44, + "num_input_tokens_seen": 10177552, + "step": 15610 + }, + { + "epoch": 9.20695754716981, + "grad_norm": 2.3815324306488037, + "learning_rate": 6.5397989382091754e-06, + "loss": 0.5209, + "num_input_tokens_seen": 10181840, + "step": 15615 + }, + { + "epoch": 9.209905660377359, + "grad_norm": 2.127275228500366, + "learning_rate": 6.537351055889096e-06, + "loss": 0.2597, + "num_input_tokens_seen": 10184880, + "step": 15620 + }, + { + "epoch": 9.212853773584905, + "grad_norm": 3.5977749824523926, + "learning_rate": 6.534902766548698e-06, + "loss": 0.3355, + "num_input_tokens_seen": 10188272, + "step": 15625 + }, + { + "epoch": 9.215801886792454, + "grad_norm": 2.0789361000061035, + "learning_rate": 6.532454070836176e-06, + "loss": 0.3663, + "num_input_tokens_seen": 10190704, + "step": 15630 + }, + { + "epoch": 9.21875, + "grad_norm": 2.914146900177002, + "learning_rate": 6.530004969399836e-06, + "loss": 0.3209, + "num_input_tokens_seen": 10194096, + "step": 15635 + }, + { + "epoch": 9.221698113207546, + "grad_norm": 2.6274631023406982, + "learning_rate": 6.5275554628880865e-06, + "loss": 0.2777, + "num_input_tokens_seen": 10197616, + "step": 15640 + }, + { + "epoch": 9.224646226415095, + "grad_norm": 2.249664068222046, + "learning_rate": 6.525105551949444e-06, + "loss": 0.4667, + "num_input_tokens_seen": 10201392, + "step": 15645 + }, + { + "epoch": 9.227594339622641, + "grad_norm": 4.6555094718933105, + "learning_rate": 6.522655237232535e-06, + "loss": 0.4882, + "num_input_tokens_seen": 10204304, + "step": 15650 + }, + { + "epoch": 9.23054245283019, + "grad_norm": 3.014333724975586, + "learning_rate": 6.5202045193860885e-06, + "loss": 0.3778, + "num_input_tokens_seen": 10207280, + "step": 15655 + }, + { + "epoch": 9.233490566037736, + "grad_norm": 2.7347233295440674, + "learning_rate": 6.517753399058944e-06, + "loss": 0.3208, + "num_input_tokens_seen": 10209904, + "step": 15660 + }, + { + "epoch": 9.236438679245284, + "grad_norm": 2.4949793815612793, + "learning_rate": 6.515301876900047e-06, + "loss": 0.6337, + "num_input_tokens_seen": 10213360, + "step": 15665 + }, + { + "epoch": 9.23938679245283, + "grad_norm": 3.1196861267089844, + "learning_rate": 6.512849953558445e-06, + "loss": 0.4203, + "num_input_tokens_seen": 10216496, + "step": 15670 + }, + { + "epoch": 9.242334905660377, + "grad_norm": 1.582646131515503, + "learning_rate": 6.510397629683301e-06, + "loss": 0.3568, + "num_input_tokens_seen": 10219984, + "step": 15675 + }, + { + "epoch": 9.245283018867925, + "grad_norm": 2.570546865463257, + "learning_rate": 6.507944905923872e-06, + "loss": 0.3507, + "num_input_tokens_seen": 10223280, + "step": 15680 + }, + { + "epoch": 9.248231132075471, + "grad_norm": 2.599398612976074, + "learning_rate": 6.505491782929531e-06, + "loss": 0.3952, + "num_input_tokens_seen": 10226640, + "step": 15685 + }, + { + "epoch": 9.25117924528302, + "grad_norm": 1.7579734325408936, + "learning_rate": 6.5030382613497535e-06, + "loss": 0.3722, + "num_input_tokens_seen": 10230288, + "step": 15690 + }, + { + "epoch": 9.254127358490566, + "grad_norm": 5.006028175354004, + "learning_rate": 6.500584341834119e-06, + "loss": 0.4178, + "num_input_tokens_seen": 10233136, + "step": 15695 + }, + { + "epoch": 9.257075471698114, + "grad_norm": 3.505951404571533, + "learning_rate": 6.498130025032312e-06, + "loss": 0.3248, + "num_input_tokens_seen": 10235792, + "step": 15700 + }, + { + "epoch": 9.26002358490566, + "grad_norm": 2.3119189739227295, + "learning_rate": 6.495675311594123e-06, + "loss": 0.3616, + "num_input_tokens_seen": 10239088, + "step": 15705 + }, + { + "epoch": 9.262971698113208, + "grad_norm": 2.6068005561828613, + "learning_rate": 6.493220202169452e-06, + "loss": 0.3967, + "num_input_tokens_seen": 10242320, + "step": 15710 + }, + { + "epoch": 9.265919811320755, + "grad_norm": 2.2857918739318848, + "learning_rate": 6.490764697408295e-06, + "loss": 0.3508, + "num_input_tokens_seen": 10246160, + "step": 15715 + }, + { + "epoch": 9.268867924528301, + "grad_norm": 0.8318555951118469, + "learning_rate": 6.48830879796076e-06, + "loss": 0.2591, + "num_input_tokens_seen": 10251632, + "step": 15720 + }, + { + "epoch": 9.27181603773585, + "grad_norm": 2.582273483276367, + "learning_rate": 6.4858525044770546e-06, + "loss": 0.385, + "num_input_tokens_seen": 10254224, + "step": 15725 + }, + { + "epoch": 9.274764150943396, + "grad_norm": 3.2986502647399902, + "learning_rate": 6.483395817607497e-06, + "loss": 0.3786, + "num_input_tokens_seen": 10256976, + "step": 15730 + }, + { + "epoch": 9.277712264150944, + "grad_norm": 3.8442792892456055, + "learning_rate": 6.480938738002504e-06, + "loss": 0.3651, + "num_input_tokens_seen": 10260368, + "step": 15735 + }, + { + "epoch": 9.28066037735849, + "grad_norm": 2.140795946121216, + "learning_rate": 6.478481266312597e-06, + "loss": 0.37, + "num_input_tokens_seen": 10263920, + "step": 15740 + }, + { + "epoch": 9.283608490566039, + "grad_norm": 1.9786944389343262, + "learning_rate": 6.476023403188403e-06, + "loss": 0.357, + "num_input_tokens_seen": 10268048, + "step": 15745 + }, + { + "epoch": 9.286556603773585, + "grad_norm": 1.7013970613479614, + "learning_rate": 6.473565149280651e-06, + "loss": 0.4381, + "num_input_tokens_seen": 10271280, + "step": 15750 + }, + { + "epoch": 9.289504716981131, + "grad_norm": 1.8829222917556763, + "learning_rate": 6.471106505240175e-06, + "loss": 0.4675, + "num_input_tokens_seen": 10274256, + "step": 15755 + }, + { + "epoch": 9.29245283018868, + "grad_norm": 2.178178071975708, + "learning_rate": 6.468647471717914e-06, + "loss": 0.384, + "num_input_tokens_seen": 10276784, + "step": 15760 + }, + { + "epoch": 9.295400943396226, + "grad_norm": 4.0366997718811035, + "learning_rate": 6.466188049364902e-06, + "loss": 0.4048, + "num_input_tokens_seen": 10279024, + "step": 15765 + }, + { + "epoch": 9.298349056603774, + "grad_norm": 2.3506648540496826, + "learning_rate": 6.463728238832288e-06, + "loss": 0.4125, + "num_input_tokens_seen": 10282352, + "step": 15770 + }, + { + "epoch": 9.30129716981132, + "grad_norm": 1.6107803583145142, + "learning_rate": 6.461268040771311e-06, + "loss": 0.31, + "num_input_tokens_seen": 10285104, + "step": 15775 + }, + { + "epoch": 9.304245283018869, + "grad_norm": 2.6596596240997314, + "learning_rate": 6.458807455833326e-06, + "loss": 0.4125, + "num_input_tokens_seen": 10287824, + "step": 15780 + }, + { + "epoch": 9.307193396226415, + "grad_norm": 1.9756290912628174, + "learning_rate": 6.456346484669778e-06, + "loss": 0.4139, + "num_input_tokens_seen": 10292016, + "step": 15785 + }, + { + "epoch": 9.310141509433961, + "grad_norm": 2.5869123935699463, + "learning_rate": 6.4538851279322225e-06, + "loss": 0.3732, + "num_input_tokens_seen": 10295568, + "step": 15790 + }, + { + "epoch": 9.31308962264151, + "grad_norm": 2.1338016986846924, + "learning_rate": 6.451423386272312e-06, + "loss": 0.3142, + "num_input_tokens_seen": 10302928, + "step": 15795 + }, + { + "epoch": 9.316037735849056, + "grad_norm": 1.8033570051193237, + "learning_rate": 6.448961260341806e-06, + "loss": 0.4111, + "num_input_tokens_seen": 10306544, + "step": 15800 + }, + { + "epoch": 9.318985849056604, + "grad_norm": 1.202943205833435, + "learning_rate": 6.446498750792563e-06, + "loss": 0.3325, + "num_input_tokens_seen": 10309424, + "step": 15805 + }, + { + "epoch": 9.32193396226415, + "grad_norm": 4.276802062988281, + "learning_rate": 6.444035858276538e-06, + "loss": 0.3642, + "num_input_tokens_seen": 10312496, + "step": 15810 + }, + { + "epoch": 9.324882075471699, + "grad_norm": 4.620153427124023, + "learning_rate": 6.441572583445799e-06, + "loss": 0.3718, + "num_input_tokens_seen": 10315376, + "step": 15815 + }, + { + "epoch": 9.327830188679245, + "grad_norm": 2.3456342220306396, + "learning_rate": 6.439108926952504e-06, + "loss": 0.3244, + "num_input_tokens_seen": 10318544, + "step": 15820 + }, + { + "epoch": 9.330778301886792, + "grad_norm": 2.2263245582580566, + "learning_rate": 6.43664488944892e-06, + "loss": 0.4835, + "num_input_tokens_seen": 10321904, + "step": 15825 + }, + { + "epoch": 9.33372641509434, + "grad_norm": 2.76908278465271, + "learning_rate": 6.434180471587409e-06, + "loss": 0.3416, + "num_input_tokens_seen": 10325392, + "step": 15830 + }, + { + "epoch": 9.336674528301886, + "grad_norm": 2.044100284576416, + "learning_rate": 6.431715674020438e-06, + "loss": 0.3338, + "num_input_tokens_seen": 10328304, + "step": 15835 + }, + { + "epoch": 9.339622641509434, + "grad_norm": 2.7489852905273438, + "learning_rate": 6.429250497400571e-06, + "loss": 0.4561, + "num_input_tokens_seen": 10334224, + "step": 15840 + }, + { + "epoch": 9.34257075471698, + "grad_norm": 7.595127582550049, + "learning_rate": 6.426784942380475e-06, + "loss": 0.501, + "num_input_tokens_seen": 10337200, + "step": 15845 + }, + { + "epoch": 9.345518867924529, + "grad_norm": 12.796364784240723, + "learning_rate": 6.424319009612917e-06, + "loss": 0.3969, + "num_input_tokens_seen": 10340560, + "step": 15850 + }, + { + "epoch": 9.348466981132075, + "grad_norm": 2.534548759460449, + "learning_rate": 6.421852699750763e-06, + "loss": 0.3936, + "num_input_tokens_seen": 10343856, + "step": 15855 + }, + { + "epoch": 9.351415094339623, + "grad_norm": 2.4600634574890137, + "learning_rate": 6.4193860134469775e-06, + "loss": 0.3257, + "num_input_tokens_seen": 10346224, + "step": 15860 + }, + { + "epoch": 9.35436320754717, + "grad_norm": 2.678802013397217, + "learning_rate": 6.416918951354629e-06, + "loss": 0.3133, + "num_input_tokens_seen": 10349296, + "step": 15865 + }, + { + "epoch": 9.357311320754716, + "grad_norm": 2.17466402053833, + "learning_rate": 6.4144515141268816e-06, + "loss": 0.2675, + "num_input_tokens_seen": 10352176, + "step": 15870 + }, + { + "epoch": 9.360259433962264, + "grad_norm": 2.9604885578155518, + "learning_rate": 6.411983702416999e-06, + "loss": 0.3042, + "num_input_tokens_seen": 10355472, + "step": 15875 + }, + { + "epoch": 9.36320754716981, + "grad_norm": 4.104915618896484, + "learning_rate": 6.409515516878346e-06, + "loss": 0.3803, + "num_input_tokens_seen": 10358608, + "step": 15880 + }, + { + "epoch": 9.366155660377359, + "grad_norm": 4.079616546630859, + "learning_rate": 6.407046958164387e-06, + "loss": 0.4691, + "num_input_tokens_seen": 10362000, + "step": 15885 + }, + { + "epoch": 9.369103773584905, + "grad_norm": 2.215550184249878, + "learning_rate": 6.404578026928679e-06, + "loss": 0.4357, + "num_input_tokens_seen": 10364784, + "step": 15890 + }, + { + "epoch": 9.372051886792454, + "grad_norm": 3.5875542163848877, + "learning_rate": 6.402108723824887e-06, + "loss": 0.4585, + "num_input_tokens_seen": 10367760, + "step": 15895 + }, + { + "epoch": 9.375, + "grad_norm": 1.7194088697433472, + "learning_rate": 6.399639049506767e-06, + "loss": 0.3507, + "num_input_tokens_seen": 10374096, + "step": 15900 + }, + { + "epoch": 9.377948113207546, + "grad_norm": 5.651604652404785, + "learning_rate": 6.397169004628177e-06, + "loss": 0.4052, + "num_input_tokens_seen": 10377168, + "step": 15905 + }, + { + "epoch": 9.380896226415095, + "grad_norm": 2.679089069366455, + "learning_rate": 6.39469858984307e-06, + "loss": 0.5029, + "num_input_tokens_seen": 10380400, + "step": 15910 + }, + { + "epoch": 9.383844339622641, + "grad_norm": 2.7768306732177734, + "learning_rate": 6.3922278058055024e-06, + "loss": 0.3237, + "num_input_tokens_seen": 10383248, + "step": 15915 + }, + { + "epoch": 9.38679245283019, + "grad_norm": 1.9655019044876099, + "learning_rate": 6.389756653169622e-06, + "loss": 0.3945, + "num_input_tokens_seen": 10386320, + "step": 15920 + }, + { + "epoch": 9.389740566037736, + "grad_norm": 2.027432680130005, + "learning_rate": 6.387285132589678e-06, + "loss": 0.3749, + "num_input_tokens_seen": 10389424, + "step": 15925 + }, + { + "epoch": 9.392688679245284, + "grad_norm": 2.1318700313568115, + "learning_rate": 6.3848132447200166e-06, + "loss": 0.3635, + "num_input_tokens_seen": 10393072, + "step": 15930 + }, + { + "epoch": 9.39563679245283, + "grad_norm": 2.002270221710205, + "learning_rate": 6.38234099021508e-06, + "loss": 0.3938, + "num_input_tokens_seen": 10396304, + "step": 15935 + }, + { + "epoch": 9.398584905660377, + "grad_norm": 2.3423099517822266, + "learning_rate": 6.379868369729409e-06, + "loss": 0.333, + "num_input_tokens_seen": 10399664, + "step": 15940 + }, + { + "epoch": 9.401533018867925, + "grad_norm": 2.790731191635132, + "learning_rate": 6.377395383917642e-06, + "loss": 0.4399, + "num_input_tokens_seen": 10402896, + "step": 15945 + }, + { + "epoch": 9.404481132075471, + "grad_norm": 2.076799154281616, + "learning_rate": 6.374922033434507e-06, + "loss": 0.3293, + "num_input_tokens_seen": 10405296, + "step": 15950 + }, + { + "epoch": 9.40742924528302, + "grad_norm": 2.4376301765441895, + "learning_rate": 6.372448318934842e-06, + "loss": 0.4693, + "num_input_tokens_seen": 10408496, + "step": 15955 + }, + { + "epoch": 9.410377358490566, + "grad_norm": 2.3520314693450928, + "learning_rate": 6.369974241073569e-06, + "loss": 0.3165, + "num_input_tokens_seen": 10411888, + "step": 15960 + }, + { + "epoch": 9.413325471698114, + "grad_norm": 2.0389559268951416, + "learning_rate": 6.367499800505709e-06, + "loss": 0.36, + "num_input_tokens_seen": 10415216, + "step": 15965 + }, + { + "epoch": 9.41627358490566, + "grad_norm": 3.260810613632202, + "learning_rate": 6.365024997886384e-06, + "loss": 0.3442, + "num_input_tokens_seen": 10418288, + "step": 15970 + }, + { + "epoch": 9.419221698113208, + "grad_norm": 2.6975209712982178, + "learning_rate": 6.362549833870808e-06, + "loss": 0.3624, + "num_input_tokens_seen": 10420912, + "step": 15975 + }, + { + "epoch": 9.422169811320755, + "grad_norm": 1.567344069480896, + "learning_rate": 6.360074309114293e-06, + "loss": 0.3843, + "num_input_tokens_seen": 10423408, + "step": 15980 + }, + { + "epoch": 9.425117924528301, + "grad_norm": 2.631965398788452, + "learning_rate": 6.357598424272241e-06, + "loss": 0.3235, + "num_input_tokens_seen": 10427792, + "step": 15985 + }, + { + "epoch": 9.42806603773585, + "grad_norm": 3.1568689346313477, + "learning_rate": 6.355122180000156e-06, + "loss": 0.4691, + "num_input_tokens_seen": 10430928, + "step": 15990 + }, + { + "epoch": 9.431014150943396, + "grad_norm": 3.0079305171966553, + "learning_rate": 6.352645576953635e-06, + "loss": 0.4132, + "num_input_tokens_seen": 10434704, + "step": 15995 + }, + { + "epoch": 9.433962264150944, + "grad_norm": 1.860533595085144, + "learning_rate": 6.350168615788366e-06, + "loss": 0.3727, + "num_input_tokens_seen": 10438096, + "step": 16000 + }, + { + "epoch": 9.43691037735849, + "grad_norm": 2.781571865081787, + "learning_rate": 6.347691297160137e-06, + "loss": 0.3399, + "num_input_tokens_seen": 10441936, + "step": 16005 + }, + { + "epoch": 9.439858490566039, + "grad_norm": 4.923755168914795, + "learning_rate": 6.34521362172483e-06, + "loss": 0.3689, + "num_input_tokens_seen": 10445552, + "step": 16010 + }, + { + "epoch": 9.442806603773585, + "grad_norm": 2.797358512878418, + "learning_rate": 6.342735590138417e-06, + "loss": 0.3888, + "num_input_tokens_seen": 10448848, + "step": 16015 + }, + { + "epoch": 9.445754716981131, + "grad_norm": 2.8626868724823, + "learning_rate": 6.340257203056972e-06, + "loss": 0.4097, + "num_input_tokens_seen": 10452592, + "step": 16020 + }, + { + "epoch": 9.44870283018868, + "grad_norm": 2.9776132106781006, + "learning_rate": 6.3377784611366554e-06, + "loss": 0.439, + "num_input_tokens_seen": 10456272, + "step": 16025 + }, + { + "epoch": 9.451650943396226, + "grad_norm": 3.118195056915283, + "learning_rate": 6.335299365033726e-06, + "loss": 0.4599, + "num_input_tokens_seen": 10460080, + "step": 16030 + }, + { + "epoch": 9.454599056603774, + "grad_norm": 2.9336352348327637, + "learning_rate": 6.3328199154045346e-06, + "loss": 0.3853, + "num_input_tokens_seen": 10463248, + "step": 16035 + }, + { + "epoch": 9.45754716981132, + "grad_norm": 1.3094220161437988, + "learning_rate": 6.3303401129055265e-06, + "loss": 0.4637, + "num_input_tokens_seen": 10466896, + "step": 16040 + }, + { + "epoch": 9.460495283018869, + "grad_norm": 1.9588351249694824, + "learning_rate": 6.32785995819324e-06, + "loss": 0.3629, + "num_input_tokens_seen": 10470128, + "step": 16045 + }, + { + "epoch": 9.463443396226415, + "grad_norm": 2.59356951713562, + "learning_rate": 6.3253794519243075e-06, + "loss": 0.4269, + "num_input_tokens_seen": 10473840, + "step": 16050 + }, + { + "epoch": 9.466391509433961, + "grad_norm": 2.6218159198760986, + "learning_rate": 6.322898594755452e-06, + "loss": 0.3994, + "num_input_tokens_seen": 10477424, + "step": 16055 + }, + { + "epoch": 9.46933962264151, + "grad_norm": 1.5984671115875244, + "learning_rate": 6.320417387343492e-06, + "loss": 0.3488, + "num_input_tokens_seen": 10480688, + "step": 16060 + }, + { + "epoch": 9.472287735849056, + "grad_norm": 2.4238035678863525, + "learning_rate": 6.3179358303453386e-06, + "loss": 0.357, + "num_input_tokens_seen": 10484720, + "step": 16065 + }, + { + "epoch": 9.475235849056604, + "grad_norm": 2.3635451793670654, + "learning_rate": 6.315453924417995e-06, + "loss": 0.3579, + "num_input_tokens_seen": 10487952, + "step": 16070 + }, + { + "epoch": 9.47818396226415, + "grad_norm": 2.6382174491882324, + "learning_rate": 6.312971670218554e-06, + "loss": 0.3065, + "num_input_tokens_seen": 10491664, + "step": 16075 + }, + { + "epoch": 9.481132075471699, + "grad_norm": 1.791094183921814, + "learning_rate": 6.3104890684042055e-06, + "loss": 0.4577, + "num_input_tokens_seen": 10494416, + "step": 16080 + }, + { + "epoch": 9.484080188679245, + "grad_norm": 3.0495352745056152, + "learning_rate": 6.308006119632228e-06, + "loss": 0.3334, + "num_input_tokens_seen": 10496816, + "step": 16085 + }, + { + "epoch": 9.487028301886792, + "grad_norm": 2.028726816177368, + "learning_rate": 6.305522824559993e-06, + "loss": 0.3879, + "num_input_tokens_seen": 10500304, + "step": 16090 + }, + { + "epoch": 9.48997641509434, + "grad_norm": 2.6754910945892334, + "learning_rate": 6.303039183844965e-06, + "loss": 0.4932, + "num_input_tokens_seen": 10502992, + "step": 16095 + }, + { + "epoch": 9.492924528301886, + "grad_norm": 3.8133046627044678, + "learning_rate": 6.300555198144697e-06, + "loss": 0.4101, + "num_input_tokens_seen": 10505200, + "step": 16100 + }, + { + "epoch": 9.495872641509434, + "grad_norm": 2.085482120513916, + "learning_rate": 6.2980708681168335e-06, + "loss": 0.4393, + "num_input_tokens_seen": 10508176, + "step": 16105 + }, + { + "epoch": 9.49882075471698, + "grad_norm": 2.685556411743164, + "learning_rate": 6.2955861944191145e-06, + "loss": 0.3644, + "num_input_tokens_seen": 10510928, + "step": 16110 + }, + { + "epoch": 9.501768867924529, + "grad_norm": 2.5360171794891357, + "learning_rate": 6.293101177709367e-06, + "loss": 0.4075, + "num_input_tokens_seen": 10514000, + "step": 16115 + }, + { + "epoch": 9.504716981132075, + "grad_norm": 4.248354434967041, + "learning_rate": 6.290615818645509e-06, + "loss": 0.3591, + "num_input_tokens_seen": 10517392, + "step": 16120 + }, + { + "epoch": 9.507665094339622, + "grad_norm": 3.3375558853149414, + "learning_rate": 6.288130117885552e-06, + "loss": 0.4013, + "num_input_tokens_seen": 10520688, + "step": 16125 + }, + { + "epoch": 9.51061320754717, + "grad_norm": 2.1879544258117676, + "learning_rate": 6.285644076087594e-06, + "loss": 0.3585, + "num_input_tokens_seen": 10523280, + "step": 16130 + }, + { + "epoch": 9.513561320754716, + "grad_norm": 2.019651412963867, + "learning_rate": 6.283157693909826e-06, + "loss": 0.3922, + "num_input_tokens_seen": 10526928, + "step": 16135 + }, + { + "epoch": 9.516509433962264, + "grad_norm": 4.027740478515625, + "learning_rate": 6.280670972010528e-06, + "loss": 0.2761, + "num_input_tokens_seen": 10529968, + "step": 16140 + }, + { + "epoch": 9.51945754716981, + "grad_norm": 3.4406824111938477, + "learning_rate": 6.278183911048072e-06, + "loss": 0.3571, + "num_input_tokens_seen": 10533200, + "step": 16145 + }, + { + "epoch": 9.522405660377359, + "grad_norm": 3.4908158779144287, + "learning_rate": 6.275696511680915e-06, + "loss": 0.3913, + "num_input_tokens_seen": 10536048, + "step": 16150 + }, + { + "epoch": 9.525353773584905, + "grad_norm": 3.276486396789551, + "learning_rate": 6.27320877456761e-06, + "loss": 0.506, + "num_input_tokens_seen": 10539760, + "step": 16155 + }, + { + "epoch": 9.528301886792454, + "grad_norm": 2.7534854412078857, + "learning_rate": 6.270720700366793e-06, + "loss": 0.6174, + "num_input_tokens_seen": 10544016, + "step": 16160 + }, + { + "epoch": 9.53125, + "grad_norm": 1.7970536947250366, + "learning_rate": 6.2682322897371974e-06, + "loss": 0.4251, + "num_input_tokens_seen": 10547984, + "step": 16165 + }, + { + "epoch": 9.534198113207546, + "grad_norm": 2.959211587905884, + "learning_rate": 6.265743543337634e-06, + "loss": 0.3509, + "num_input_tokens_seen": 10551568, + "step": 16170 + }, + { + "epoch": 9.537146226415095, + "grad_norm": 1.4761197566986084, + "learning_rate": 6.2632544618270156e-06, + "loss": 0.2753, + "num_input_tokens_seen": 10554608, + "step": 16175 + }, + { + "epoch": 9.540094339622641, + "grad_norm": 2.2524514198303223, + "learning_rate": 6.260765045864332e-06, + "loss": 0.4476, + "num_input_tokens_seen": 10558160, + "step": 16180 + }, + { + "epoch": 9.54304245283019, + "grad_norm": 2.325085401535034, + "learning_rate": 6.258275296108669e-06, + "loss": 0.4223, + "num_input_tokens_seen": 10560720, + "step": 16185 + }, + { + "epoch": 9.545990566037736, + "grad_norm": 1.6599845886230469, + "learning_rate": 6.2557852132191985e-06, + "loss": 0.4747, + "num_input_tokens_seen": 10564752, + "step": 16190 + }, + { + "epoch": 9.548938679245284, + "grad_norm": 4.046700954437256, + "learning_rate": 6.253294797855182e-06, + "loss": 0.3604, + "num_input_tokens_seen": 10567600, + "step": 16195 + }, + { + "epoch": 9.55188679245283, + "grad_norm": 2.2707836627960205, + "learning_rate": 6.250804050675964e-06, + "loss": 0.4247, + "num_input_tokens_seen": 10571280, + "step": 16200 + }, + { + "epoch": 9.554834905660378, + "grad_norm": 3.4458892345428467, + "learning_rate": 6.248312972340984e-06, + "loss": 0.4556, + "num_input_tokens_seen": 10574928, + "step": 16205 + }, + { + "epoch": 9.557783018867925, + "grad_norm": 1.3727914094924927, + "learning_rate": 6.2458215635097656e-06, + "loss": 0.5248, + "num_input_tokens_seen": 10578864, + "step": 16210 + }, + { + "epoch": 9.560731132075471, + "grad_norm": 2.6479766368865967, + "learning_rate": 6.2433298248419175e-06, + "loss": 0.3577, + "num_input_tokens_seen": 10581648, + "step": 16215 + }, + { + "epoch": 9.56367924528302, + "grad_norm": 2.6002891063690186, + "learning_rate": 6.2408377569971405e-06, + "loss": 0.3685, + "num_input_tokens_seen": 10585424, + "step": 16220 + }, + { + "epoch": 9.566627358490566, + "grad_norm": 6.889472961425781, + "learning_rate": 6.238345360635221e-06, + "loss": 0.4587, + "num_input_tokens_seen": 10587856, + "step": 16225 + }, + { + "epoch": 9.569575471698114, + "grad_norm": 2.6355085372924805, + "learning_rate": 6.2358526364160274e-06, + "loss": 0.4316, + "num_input_tokens_seen": 10591536, + "step": 16230 + }, + { + "epoch": 9.57252358490566, + "grad_norm": 1.6638799905776978, + "learning_rate": 6.2333595849995245e-06, + "loss": 0.338, + "num_input_tokens_seen": 10594768, + "step": 16235 + }, + { + "epoch": 9.575471698113208, + "grad_norm": 2.500528335571289, + "learning_rate": 6.230866207045756e-06, + "loss": 0.3746, + "num_input_tokens_seen": 10598384, + "step": 16240 + }, + { + "epoch": 9.578419811320755, + "grad_norm": 2.298816204071045, + "learning_rate": 6.228372503214853e-06, + "loss": 0.2456, + "num_input_tokens_seen": 10601776, + "step": 16245 + }, + { + "epoch": 9.581367924528301, + "grad_norm": 3.5345942974090576, + "learning_rate": 6.225878474167035e-06, + "loss": 0.6167, + "num_input_tokens_seen": 10607760, + "step": 16250 + }, + { + "epoch": 9.58431603773585, + "grad_norm": 3.6032612323760986, + "learning_rate": 6.22338412056261e-06, + "loss": 0.6155, + "num_input_tokens_seen": 10610832, + "step": 16255 + }, + { + "epoch": 9.587264150943396, + "grad_norm": 1.9593465328216553, + "learning_rate": 6.220889443061966e-06, + "loss": 0.3425, + "num_input_tokens_seen": 10614192, + "step": 16260 + }, + { + "epoch": 9.590212264150944, + "grad_norm": 5.425792694091797, + "learning_rate": 6.2183944423255796e-06, + "loss": 0.4821, + "num_input_tokens_seen": 10616848, + "step": 16265 + }, + { + "epoch": 9.59316037735849, + "grad_norm": 3.7418839931488037, + "learning_rate": 6.215899119014015e-06, + "loss": 0.3529, + "num_input_tokens_seen": 10620144, + "step": 16270 + }, + { + "epoch": 9.596108490566039, + "grad_norm": 1.9273862838745117, + "learning_rate": 6.2134034737879175e-06, + "loss": 0.3387, + "num_input_tokens_seen": 10622832, + "step": 16275 + }, + { + "epoch": 9.599056603773585, + "grad_norm": 2.1943199634552, + "learning_rate": 6.2109075073080205e-06, + "loss": 0.4255, + "num_input_tokens_seen": 10625552, + "step": 16280 + }, + { + "epoch": 9.602004716981131, + "grad_norm": 2.018352508544922, + "learning_rate": 6.2084112202351425e-06, + "loss": 0.3471, + "num_input_tokens_seen": 10628368, + "step": 16285 + }, + { + "epoch": 9.60495283018868, + "grad_norm": 3.232924222946167, + "learning_rate": 6.205914613230186e-06, + "loss": 0.3244, + "num_input_tokens_seen": 10631920, + "step": 16290 + }, + { + "epoch": 9.607900943396226, + "grad_norm": 1.989395260810852, + "learning_rate": 6.203417686954138e-06, + "loss": 0.3593, + "num_input_tokens_seen": 10635152, + "step": 16295 + }, + { + "epoch": 9.610849056603774, + "grad_norm": 2.1981143951416016, + "learning_rate": 6.2009204420680706e-06, + "loss": 0.3664, + "num_input_tokens_seen": 10638096, + "step": 16300 + }, + { + "epoch": 9.61379716981132, + "grad_norm": 2.201779365539551, + "learning_rate": 6.198422879233141e-06, + "loss": 0.4008, + "num_input_tokens_seen": 10641136, + "step": 16305 + }, + { + "epoch": 9.616745283018869, + "grad_norm": 2.6620354652404785, + "learning_rate": 6.1959249991105895e-06, + "loss": 0.4521, + "num_input_tokens_seen": 10644176, + "step": 16310 + }, + { + "epoch": 9.619693396226415, + "grad_norm": 1.9121938943862915, + "learning_rate": 6.19342680236174e-06, + "loss": 0.3356, + "num_input_tokens_seen": 10646608, + "step": 16315 + }, + { + "epoch": 9.622641509433961, + "grad_norm": 3.327310562133789, + "learning_rate": 6.190928289648003e-06, + "loss": 0.4796, + "num_input_tokens_seen": 10650672, + "step": 16320 + }, + { + "epoch": 9.62558962264151, + "grad_norm": 2.184406042098999, + "learning_rate": 6.188429461630866e-06, + "loss": 0.3043, + "num_input_tokens_seen": 10653680, + "step": 16325 + }, + { + "epoch": 9.628537735849056, + "grad_norm": 4.023909091949463, + "learning_rate": 6.185930318971909e-06, + "loss": 0.3331, + "num_input_tokens_seen": 10656976, + "step": 16330 + }, + { + "epoch": 9.631485849056604, + "grad_norm": 2.4213967323303223, + "learning_rate": 6.1834308623327885e-06, + "loss": 0.441, + "num_input_tokens_seen": 10659664, + "step": 16335 + }, + { + "epoch": 9.63443396226415, + "grad_norm": 11.27309513092041, + "learning_rate": 6.180931092375247e-06, + "loss": 0.3928, + "num_input_tokens_seen": 10663792, + "step": 16340 + }, + { + "epoch": 9.637382075471699, + "grad_norm": 3.23309326171875, + "learning_rate": 6.1784310097611075e-06, + "loss": 0.3496, + "num_input_tokens_seen": 10666160, + "step": 16345 + }, + { + "epoch": 9.640330188679245, + "grad_norm": 2.8784143924713135, + "learning_rate": 6.1759306151522815e-06, + "loss": 0.345, + "num_input_tokens_seen": 10668976, + "step": 16350 + }, + { + "epoch": 9.643278301886792, + "grad_norm": 3.293720245361328, + "learning_rate": 6.173429909210755e-06, + "loss": 0.2711, + "num_input_tokens_seen": 10672784, + "step": 16355 + }, + { + "epoch": 9.64622641509434, + "grad_norm": 2.3329575061798096, + "learning_rate": 6.170928892598606e-06, + "loss": 0.4054, + "num_input_tokens_seen": 10676336, + "step": 16360 + }, + { + "epoch": 9.649174528301886, + "grad_norm": 1.9740569591522217, + "learning_rate": 6.168427565977984e-06, + "loss": 0.3172, + "num_input_tokens_seen": 10679344, + "step": 16365 + }, + { + "epoch": 9.652122641509434, + "grad_norm": 2.0854477882385254, + "learning_rate": 6.165925930011129e-06, + "loss": 0.3845, + "num_input_tokens_seen": 10682416, + "step": 16370 + }, + { + "epoch": 9.65507075471698, + "grad_norm": 2.4912619590759277, + "learning_rate": 6.163423985360359e-06, + "loss": 0.4313, + "num_input_tokens_seen": 10685616, + "step": 16375 + }, + { + "epoch": 9.658018867924529, + "grad_norm": 1.7878752946853638, + "learning_rate": 6.160921732688076e-06, + "loss": 0.3738, + "num_input_tokens_seen": 10688848, + "step": 16380 + }, + { + "epoch": 9.660966981132075, + "grad_norm": 2.225677013397217, + "learning_rate": 6.158419172656759e-06, + "loss": 0.5055, + "num_input_tokens_seen": 10692048, + "step": 16385 + }, + { + "epoch": 9.663915094339622, + "grad_norm": 3.603638172149658, + "learning_rate": 6.155916305928974e-06, + "loss": 0.4606, + "num_input_tokens_seen": 10695472, + "step": 16390 + }, + { + "epoch": 9.66686320754717, + "grad_norm": 2.2072694301605225, + "learning_rate": 6.153413133167366e-06, + "loss": 0.3599, + "num_input_tokens_seen": 10698768, + "step": 16395 + }, + { + "epoch": 9.669811320754716, + "grad_norm": 2.3150978088378906, + "learning_rate": 6.1509096550346596e-06, + "loss": 0.3355, + "num_input_tokens_seen": 10702288, + "step": 16400 + }, + { + "epoch": 9.672759433962264, + "grad_norm": 1.9942560195922852, + "learning_rate": 6.148405872193661e-06, + "loss": 0.4104, + "num_input_tokens_seen": 10706096, + "step": 16405 + }, + { + "epoch": 9.67570754716981, + "grad_norm": 1.9642401933670044, + "learning_rate": 6.14590178530726e-06, + "loss": 0.433, + "num_input_tokens_seen": 10709392, + "step": 16410 + }, + { + "epoch": 9.678655660377359, + "grad_norm": 2.665734052658081, + "learning_rate": 6.143397395038422e-06, + "loss": 0.3584, + "num_input_tokens_seen": 10713360, + "step": 16415 + }, + { + "epoch": 9.681603773584905, + "grad_norm": 2.785623550415039, + "learning_rate": 6.140892702050196e-06, + "loss": 0.4353, + "num_input_tokens_seen": 10716464, + "step": 16420 + }, + { + "epoch": 9.684551886792454, + "grad_norm": 3.070402145385742, + "learning_rate": 6.138387707005711e-06, + "loss": 0.4035, + "num_input_tokens_seen": 10719824, + "step": 16425 + }, + { + "epoch": 9.6875, + "grad_norm": 2.493583917617798, + "learning_rate": 6.135882410568172e-06, + "loss": 0.4111, + "num_input_tokens_seen": 10722768, + "step": 16430 + }, + { + "epoch": 9.690448113207546, + "grad_norm": 2.19046688079834, + "learning_rate": 6.133376813400872e-06, + "loss": 0.4062, + "num_input_tokens_seen": 10724944, + "step": 16435 + }, + { + "epoch": 9.693396226415095, + "grad_norm": 2.1911120414733887, + "learning_rate": 6.130870916167175e-06, + "loss": 0.4833, + "num_input_tokens_seen": 10728048, + "step": 16440 + }, + { + "epoch": 9.696344339622641, + "grad_norm": 1.8458647727966309, + "learning_rate": 6.128364719530528e-06, + "loss": 0.5316, + "num_input_tokens_seen": 10730768, + "step": 16445 + }, + { + "epoch": 9.69929245283019, + "grad_norm": 2.6938154697418213, + "learning_rate": 6.125858224154459e-06, + "loss": 0.4095, + "num_input_tokens_seen": 10734000, + "step": 16450 + }, + { + "epoch": 9.702240566037736, + "grad_norm": 3.6339621543884277, + "learning_rate": 6.123351430702576e-06, + "loss": 0.3504, + "num_input_tokens_seen": 10736592, + "step": 16455 + }, + { + "epoch": 9.705188679245284, + "grad_norm": 1.8029247522354126, + "learning_rate": 6.1208443398385575e-06, + "loss": 0.294, + "num_input_tokens_seen": 10739888, + "step": 16460 + }, + { + "epoch": 9.70813679245283, + "grad_norm": 1.8889799118041992, + "learning_rate": 6.118336952226169e-06, + "loss": 0.5239, + "num_input_tokens_seen": 10743984, + "step": 16465 + }, + { + "epoch": 9.711084905660378, + "grad_norm": 2.1132171154022217, + "learning_rate": 6.115829268529254e-06, + "loss": 0.3339, + "num_input_tokens_seen": 10747792, + "step": 16470 + }, + { + "epoch": 9.714033018867925, + "grad_norm": 3.6159753799438477, + "learning_rate": 6.1133212894117326e-06, + "loss": 0.3415, + "num_input_tokens_seen": 10751152, + "step": 16475 + }, + { + "epoch": 9.716981132075471, + "grad_norm": 3.731701374053955, + "learning_rate": 6.1108130155375986e-06, + "loss": 0.4803, + "num_input_tokens_seen": 10754192, + "step": 16480 + }, + { + "epoch": 9.71992924528302, + "grad_norm": 1.4406051635742188, + "learning_rate": 6.108304447570933e-06, + "loss": 0.3669, + "num_input_tokens_seen": 10757584, + "step": 16485 + }, + { + "epoch": 9.722877358490566, + "grad_norm": 2.607942581176758, + "learning_rate": 6.105795586175888e-06, + "loss": 0.454, + "num_input_tokens_seen": 10760816, + "step": 16490 + }, + { + "epoch": 9.725825471698114, + "grad_norm": 3.76261830329895, + "learning_rate": 6.1032864320166954e-06, + "loss": 0.4062, + "num_input_tokens_seen": 10764880, + "step": 16495 + }, + { + "epoch": 9.72877358490566, + "grad_norm": 3.5371530055999756, + "learning_rate": 6.100776985757666e-06, + "loss": 0.358, + "num_input_tokens_seen": 10767632, + "step": 16500 + }, + { + "epoch": 9.731721698113208, + "grad_norm": 2.4419028759002686, + "learning_rate": 6.098267248063186e-06, + "loss": 0.4597, + "num_input_tokens_seen": 10770288, + "step": 16505 + }, + { + "epoch": 9.734669811320755, + "grad_norm": 5.362765789031982, + "learning_rate": 6.0957572195977165e-06, + "loss": 0.4452, + "num_input_tokens_seen": 10773296, + "step": 16510 + }, + { + "epoch": 9.737617924528301, + "grad_norm": 3.4774794578552246, + "learning_rate": 6.0932469010258025e-06, + "loss": 0.4105, + "num_input_tokens_seen": 10776048, + "step": 16515 + }, + { + "epoch": 9.74056603773585, + "grad_norm": 3.1349384784698486, + "learning_rate": 6.0907362930120594e-06, + "loss": 0.3266, + "num_input_tokens_seen": 10780080, + "step": 16520 + }, + { + "epoch": 9.743514150943396, + "grad_norm": 3.8733880519866943, + "learning_rate": 6.088225396221181e-06, + "loss": 0.3753, + "num_input_tokens_seen": 10783344, + "step": 16525 + }, + { + "epoch": 9.746462264150944, + "grad_norm": 2.1874942779541016, + "learning_rate": 6.0857142113179415e-06, + "loss": 0.3171, + "num_input_tokens_seen": 10786000, + "step": 16530 + }, + { + "epoch": 9.74941037735849, + "grad_norm": 2.977844715118408, + "learning_rate": 6.083202738967182e-06, + "loss": 0.4202, + "num_input_tokens_seen": 10789744, + "step": 16535 + }, + { + "epoch": 9.752358490566039, + "grad_norm": 2.73738956451416, + "learning_rate": 6.0806909798338324e-06, + "loss": 0.3832, + "num_input_tokens_seen": 10793168, + "step": 16540 + }, + { + "epoch": 9.755306603773585, + "grad_norm": 1.7961540222167969, + "learning_rate": 6.0781789345828854e-06, + "loss": 0.3651, + "num_input_tokens_seen": 10796016, + "step": 16545 + }, + { + "epoch": 9.758254716981131, + "grad_norm": 3.155907392501831, + "learning_rate": 6.0756666038794195e-06, + "loss": 0.4898, + "num_input_tokens_seen": 10798928, + "step": 16550 + }, + { + "epoch": 9.76120283018868, + "grad_norm": 1.6378077268600464, + "learning_rate": 6.073153988388586e-06, + "loss": 0.3439, + "num_input_tokens_seen": 10802416, + "step": 16555 + }, + { + "epoch": 9.764150943396226, + "grad_norm": 2.97456693649292, + "learning_rate": 6.070641088775608e-06, + "loss": 0.3928, + "num_input_tokens_seen": 10805104, + "step": 16560 + }, + { + "epoch": 9.767099056603774, + "grad_norm": 2.215664863586426, + "learning_rate": 6.068127905705787e-06, + "loss": 0.4216, + "num_input_tokens_seen": 10808496, + "step": 16565 + }, + { + "epoch": 9.77004716981132, + "grad_norm": 2.191291570663452, + "learning_rate": 6.065614439844501e-06, + "loss": 0.4061, + "num_input_tokens_seen": 10811632, + "step": 16570 + }, + { + "epoch": 9.772995283018869, + "grad_norm": 2.213578224182129, + "learning_rate": 6.063100691857198e-06, + "loss": 0.4434, + "num_input_tokens_seen": 10814640, + "step": 16575 + }, + { + "epoch": 9.775943396226415, + "grad_norm": 4.050931930541992, + "learning_rate": 6.060586662409407e-06, + "loss": 0.502, + "num_input_tokens_seen": 10817840, + "step": 16580 + }, + { + "epoch": 9.778891509433961, + "grad_norm": 1.9550971984863281, + "learning_rate": 6.058072352166724e-06, + "loss": 0.4667, + "num_input_tokens_seen": 10820944, + "step": 16585 + }, + { + "epoch": 9.78183962264151, + "grad_norm": 2.4437146186828613, + "learning_rate": 6.055557761794826e-06, + "loss": 0.3563, + "num_input_tokens_seen": 10823760, + "step": 16590 + }, + { + "epoch": 9.784787735849056, + "grad_norm": 2.827178955078125, + "learning_rate": 6.053042891959462e-06, + "loss": 0.3519, + "num_input_tokens_seen": 10826224, + "step": 16595 + }, + { + "epoch": 9.787735849056604, + "grad_norm": 4.610492706298828, + "learning_rate": 6.050527743326455e-06, + "loss": 0.3644, + "num_input_tokens_seen": 10830288, + "step": 16600 + }, + { + "epoch": 9.79068396226415, + "grad_norm": 3.2514307498931885, + "learning_rate": 6.048012316561699e-06, + "loss": 0.3833, + "num_input_tokens_seen": 10833392, + "step": 16605 + }, + { + "epoch": 9.793632075471699, + "grad_norm": 2.8535454273223877, + "learning_rate": 6.045496612331166e-06, + "loss": 0.377, + "num_input_tokens_seen": 10836688, + "step": 16610 + }, + { + "epoch": 9.796580188679245, + "grad_norm": 2.023005485534668, + "learning_rate": 6.0429806313009e-06, + "loss": 0.5257, + "num_input_tokens_seen": 10839248, + "step": 16615 + }, + { + "epoch": 9.799528301886792, + "grad_norm": 2.504904270172119, + "learning_rate": 6.040464374137015e-06, + "loss": 0.2781, + "num_input_tokens_seen": 10842416, + "step": 16620 + }, + { + "epoch": 9.80247641509434, + "grad_norm": 2.58198618888855, + "learning_rate": 6.0379478415057045e-06, + "loss": 0.3882, + "num_input_tokens_seen": 10845136, + "step": 16625 + }, + { + "epoch": 9.805424528301886, + "grad_norm": 1.8210455179214478, + "learning_rate": 6.035431034073228e-06, + "loss": 0.2995, + "num_input_tokens_seen": 10848272, + "step": 16630 + }, + { + "epoch": 9.808372641509434, + "grad_norm": 2.355684757232666, + "learning_rate": 6.0329139525059235e-06, + "loss": 0.3818, + "num_input_tokens_seen": 10850992, + "step": 16635 + }, + { + "epoch": 9.81132075471698, + "grad_norm": 1.702268362045288, + "learning_rate": 6.030396597470198e-06, + "loss": 0.4443, + "num_input_tokens_seen": 10854096, + "step": 16640 + }, + { + "epoch": 9.814268867924529, + "grad_norm": 2.5137674808502197, + "learning_rate": 6.027878969632534e-06, + "loss": 0.3796, + "num_input_tokens_seen": 10857936, + "step": 16645 + }, + { + "epoch": 9.817216981132075, + "grad_norm": 2.0676541328430176, + "learning_rate": 6.025361069659482e-06, + "loss": 0.4803, + "num_input_tokens_seen": 10860784, + "step": 16650 + }, + { + "epoch": 9.820165094339622, + "grad_norm": 2.474544048309326, + "learning_rate": 6.022842898217668e-06, + "loss": 0.3006, + "num_input_tokens_seen": 10863888, + "step": 16655 + }, + { + "epoch": 9.82311320754717, + "grad_norm": 1.9515637159347534, + "learning_rate": 6.020324455973788e-06, + "loss": 0.4809, + "num_input_tokens_seen": 10867632, + "step": 16660 + }, + { + "epoch": 9.826061320754716, + "grad_norm": 1.8432711362838745, + "learning_rate": 6.017805743594612e-06, + "loss": 0.3865, + "num_input_tokens_seen": 10870192, + "step": 16665 + }, + { + "epoch": 9.829009433962264, + "grad_norm": 2.3254454135894775, + "learning_rate": 6.0152867617469776e-06, + "loss": 0.3491, + "num_input_tokens_seen": 10873264, + "step": 16670 + }, + { + "epoch": 9.83195754716981, + "grad_norm": 1.9653053283691406, + "learning_rate": 6.012767511097799e-06, + "loss": 0.3091, + "num_input_tokens_seen": 10876656, + "step": 16675 + }, + { + "epoch": 9.834905660377359, + "grad_norm": 5.938626766204834, + "learning_rate": 6.010247992314055e-06, + "loss": 0.4253, + "num_input_tokens_seen": 10879824, + "step": 16680 + }, + { + "epoch": 9.837853773584905, + "grad_norm": 6.53312873840332, + "learning_rate": 6.007728206062802e-06, + "loss": 0.3466, + "num_input_tokens_seen": 10882480, + "step": 16685 + }, + { + "epoch": 9.840801886792454, + "grad_norm": 3.500185251235962, + "learning_rate": 6.005208153011163e-06, + "loss": 0.4112, + "num_input_tokens_seen": 10885648, + "step": 16690 + }, + { + "epoch": 9.84375, + "grad_norm": 2.181725025177002, + "learning_rate": 6.0026878338263335e-06, + "loss": 0.3063, + "num_input_tokens_seen": 10888880, + "step": 16695 + }, + { + "epoch": 9.846698113207546, + "grad_norm": 6.721580505371094, + "learning_rate": 6.000167249175579e-06, + "loss": 0.4255, + "num_input_tokens_seen": 10892304, + "step": 16700 + }, + { + "epoch": 9.849646226415095, + "grad_norm": 4.082174301147461, + "learning_rate": 5.997646399726236e-06, + "loss": 0.5917, + "num_input_tokens_seen": 10894960, + "step": 16705 + }, + { + "epoch": 9.852594339622641, + "grad_norm": 2.4891750812530518, + "learning_rate": 5.995125286145707e-06, + "loss": 0.3054, + "num_input_tokens_seen": 10898352, + "step": 16710 + }, + { + "epoch": 9.85554245283019, + "grad_norm": 2.6735830307006836, + "learning_rate": 5.99260390910147e-06, + "loss": 0.4061, + "num_input_tokens_seen": 10901456, + "step": 16715 + }, + { + "epoch": 9.858490566037736, + "grad_norm": 1.327249526977539, + "learning_rate": 5.990082269261071e-06, + "loss": 0.3434, + "num_input_tokens_seen": 10904560, + "step": 16720 + }, + { + "epoch": 9.861438679245284, + "grad_norm": 2.5125489234924316, + "learning_rate": 5.987560367292123e-06, + "loss": 0.4097, + "num_input_tokens_seen": 10908080, + "step": 16725 + }, + { + "epoch": 9.86438679245283, + "grad_norm": 1.7410204410552979, + "learning_rate": 5.985038203862313e-06, + "loss": 0.3271, + "num_input_tokens_seen": 10911056, + "step": 16730 + }, + { + "epoch": 9.867334905660378, + "grad_norm": 1.4541611671447754, + "learning_rate": 5.982515779639393e-06, + "loss": 0.2912, + "num_input_tokens_seen": 10914384, + "step": 16735 + }, + { + "epoch": 9.870283018867925, + "grad_norm": 2.2535011768341064, + "learning_rate": 5.979993095291186e-06, + "loss": 0.4713, + "num_input_tokens_seen": 10918000, + "step": 16740 + }, + { + "epoch": 9.873231132075471, + "grad_norm": 1.45979905128479, + "learning_rate": 5.977470151485582e-06, + "loss": 0.2117, + "num_input_tokens_seen": 10921104, + "step": 16745 + }, + { + "epoch": 9.87617924528302, + "grad_norm": 2.174992322921753, + "learning_rate": 5.974946948890544e-06, + "loss": 0.328, + "num_input_tokens_seen": 10924848, + "step": 16750 + }, + { + "epoch": 9.879127358490566, + "grad_norm": 1.6254414319992065, + "learning_rate": 5.9724234881740994e-06, + "loss": 0.411, + "num_input_tokens_seen": 10928208, + "step": 16755 + }, + { + "epoch": 9.882075471698114, + "grad_norm": 1.5500922203063965, + "learning_rate": 5.9698997700043445e-06, + "loss": 0.334, + "num_input_tokens_seen": 10931920, + "step": 16760 + }, + { + "epoch": 9.88502358490566, + "grad_norm": 2.435666799545288, + "learning_rate": 5.9673757950494475e-06, + "loss": 0.2803, + "num_input_tokens_seen": 10936016, + "step": 16765 + }, + { + "epoch": 9.887971698113208, + "grad_norm": 3.388235330581665, + "learning_rate": 5.964851563977639e-06, + "loss": 0.4116, + "num_input_tokens_seen": 10938224, + "step": 16770 + }, + { + "epoch": 9.890919811320755, + "grad_norm": 3.7855443954467773, + "learning_rate": 5.962327077457219e-06, + "loss": 0.2549, + "num_input_tokens_seen": 10940816, + "step": 16775 + }, + { + "epoch": 9.893867924528301, + "grad_norm": 3.4862096309661865, + "learning_rate": 5.959802336156558e-06, + "loss": 0.4353, + "num_input_tokens_seen": 10943536, + "step": 16780 + }, + { + "epoch": 9.89681603773585, + "grad_norm": 1.2363417148590088, + "learning_rate": 5.957277340744094e-06, + "loss": 0.367, + "num_input_tokens_seen": 10946480, + "step": 16785 + }, + { + "epoch": 9.899764150943396, + "grad_norm": 3.063689708709717, + "learning_rate": 5.954752091888326e-06, + "loss": 0.5593, + "num_input_tokens_seen": 10949968, + "step": 16790 + }, + { + "epoch": 9.902712264150944, + "grad_norm": 2.1563096046447754, + "learning_rate": 5.952226590257829e-06, + "loss": 0.5114, + "num_input_tokens_seen": 10952656, + "step": 16795 + }, + { + "epoch": 9.90566037735849, + "grad_norm": 3.8924951553344727, + "learning_rate": 5.949700836521239e-06, + "loss": 0.3066, + "num_input_tokens_seen": 10954960, + "step": 16800 + }, + { + "epoch": 9.908608490566039, + "grad_norm": 2.554572820663452, + "learning_rate": 5.947174831347257e-06, + "loss": 0.3967, + "num_input_tokens_seen": 10958256, + "step": 16805 + }, + { + "epoch": 9.911556603773585, + "grad_norm": 1.4245203733444214, + "learning_rate": 5.944648575404657e-06, + "loss": 0.4072, + "num_input_tokens_seen": 10963056, + "step": 16810 + }, + { + "epoch": 9.914504716981131, + "grad_norm": 3.73224139213562, + "learning_rate": 5.942122069362276e-06, + "loss": 0.4447, + "num_input_tokens_seen": 10966384, + "step": 16815 + }, + { + "epoch": 9.91745283018868, + "grad_norm": 3.2709403038024902, + "learning_rate": 5.939595313889016e-06, + "loss": 0.4194, + "num_input_tokens_seen": 10969680, + "step": 16820 + }, + { + "epoch": 9.920400943396226, + "grad_norm": 2.644888162612915, + "learning_rate": 5.937068309653848e-06, + "loss": 0.3625, + "num_input_tokens_seen": 10973360, + "step": 16825 + }, + { + "epoch": 9.923349056603774, + "grad_norm": 4.134654998779297, + "learning_rate": 5.934541057325807e-06, + "loss": 0.3913, + "num_input_tokens_seen": 10976976, + "step": 16830 + }, + { + "epoch": 9.92629716981132, + "grad_norm": 1.81215238571167, + "learning_rate": 5.932013557573992e-06, + "loss": 0.4516, + "num_input_tokens_seen": 10980560, + "step": 16835 + }, + { + "epoch": 9.929245283018869, + "grad_norm": 4.7528300285339355, + "learning_rate": 5.929485811067572e-06, + "loss": 0.3088, + "num_input_tokens_seen": 10984080, + "step": 16840 + }, + { + "epoch": 9.932193396226415, + "grad_norm": 2.3289639949798584, + "learning_rate": 5.926957818475778e-06, + "loss": 0.346, + "num_input_tokens_seen": 10987056, + "step": 16845 + }, + { + "epoch": 9.935141509433961, + "grad_norm": 3.151592969894409, + "learning_rate": 5.924429580467905e-06, + "loss": 0.3832, + "num_input_tokens_seen": 10990480, + "step": 16850 + }, + { + "epoch": 9.93808962264151, + "grad_norm": 2.5649797916412354, + "learning_rate": 5.921901097713317e-06, + "loss": 0.3044, + "num_input_tokens_seen": 10993520, + "step": 16855 + }, + { + "epoch": 9.941037735849056, + "grad_norm": 3.349287271499634, + "learning_rate": 5.919372370881442e-06, + "loss": 0.2404, + "num_input_tokens_seen": 10996208, + "step": 16860 + }, + { + "epoch": 9.943985849056604, + "grad_norm": 2.22196364402771, + "learning_rate": 5.916843400641768e-06, + "loss": 0.413, + "num_input_tokens_seen": 11000016, + "step": 16865 + }, + { + "epoch": 9.94693396226415, + "grad_norm": 2.7622289657592773, + "learning_rate": 5.914314187663851e-06, + "loss": 0.3892, + "num_input_tokens_seen": 11003024, + "step": 16870 + }, + { + "epoch": 9.949882075471699, + "grad_norm": 3.572880506515503, + "learning_rate": 5.911784732617314e-06, + "loss": 0.3314, + "num_input_tokens_seen": 11005168, + "step": 16875 + }, + { + "epoch": 9.952830188679245, + "grad_norm": 2.1173453330993652, + "learning_rate": 5.90925503617184e-06, + "loss": 0.3725, + "num_input_tokens_seen": 11008336, + "step": 16880 + }, + { + "epoch": 9.955778301886792, + "grad_norm": 4.31459903717041, + "learning_rate": 5.9067250989971745e-06, + "loss": 0.4573, + "num_input_tokens_seen": 11011408, + "step": 16885 + }, + { + "epoch": 9.95872641509434, + "grad_norm": 2.175584554672241, + "learning_rate": 5.904194921763133e-06, + "loss": 0.2939, + "num_input_tokens_seen": 11014928, + "step": 16890 + }, + { + "epoch": 9.961674528301886, + "grad_norm": 2.476794958114624, + "learning_rate": 5.901664505139589e-06, + "loss": 0.3057, + "num_input_tokens_seen": 11018096, + "step": 16895 + }, + { + "epoch": 9.964622641509434, + "grad_norm": 6.294628143310547, + "learning_rate": 5.8991338497964814e-06, + "loss": 0.5172, + "num_input_tokens_seen": 11021200, + "step": 16900 + }, + { + "epoch": 9.96757075471698, + "grad_norm": 2.1673519611358643, + "learning_rate": 5.896602956403812e-06, + "loss": 0.3481, + "num_input_tokens_seen": 11027280, + "step": 16905 + }, + { + "epoch": 9.970518867924529, + "grad_norm": 4.833362102508545, + "learning_rate": 5.894071825631645e-06, + "loss": 0.3502, + "num_input_tokens_seen": 11030608, + "step": 16910 + }, + { + "epoch": 9.973466981132075, + "grad_norm": 2.756204128265381, + "learning_rate": 5.891540458150109e-06, + "loss": 0.4036, + "num_input_tokens_seen": 11035056, + "step": 16915 + }, + { + "epoch": 9.976415094339622, + "grad_norm": 2.4853250980377197, + "learning_rate": 5.889008854629395e-06, + "loss": 0.5076, + "num_input_tokens_seen": 11038352, + "step": 16920 + }, + { + "epoch": 9.97936320754717, + "grad_norm": 3.54835844039917, + "learning_rate": 5.886477015739754e-06, + "loss": 0.5762, + "num_input_tokens_seen": 11040912, + "step": 16925 + }, + { + "epoch": 9.982311320754716, + "grad_norm": 2.323976516723633, + "learning_rate": 5.883944942151502e-06, + "loss": 0.3954, + "num_input_tokens_seen": 11044528, + "step": 16930 + }, + { + "epoch": 9.985259433962264, + "grad_norm": 3.151970386505127, + "learning_rate": 5.88141263453502e-06, + "loss": 0.3634, + "num_input_tokens_seen": 11047152, + "step": 16935 + }, + { + "epoch": 9.98820754716981, + "grad_norm": 5.0896124839782715, + "learning_rate": 5.878880093560744e-06, + "loss": 0.2998, + "num_input_tokens_seen": 11050544, + "step": 16940 + }, + { + "epoch": 9.991155660377359, + "grad_norm": 3.026899576187134, + "learning_rate": 5.876347319899173e-06, + "loss": 0.4117, + "num_input_tokens_seen": 11053040, + "step": 16945 + }, + { + "epoch": 9.994103773584905, + "grad_norm": 2.2079579830169678, + "learning_rate": 5.873814314220874e-06, + "loss": 0.5502, + "num_input_tokens_seen": 11056656, + "step": 16950 + }, + { + "epoch": 9.997051886792454, + "grad_norm": 1.888657808303833, + "learning_rate": 5.871281077196469e-06, + "loss": 0.3848, + "num_input_tokens_seen": 11059440, + "step": 16955 + }, + { + "epoch": 10.0, + "grad_norm": 7.1668782234191895, + "learning_rate": 5.868747609496643e-06, + "loss": 0.4427, + "num_input_tokens_seen": 11063280, + "step": 16960 + }, + { + "epoch": 10.0, + "eval_loss": 0.52943354845047, + "eval_runtime": 19.2396, + "eval_samples_per_second": 88.152, + "eval_steps_per_second": 22.038, + "num_input_tokens_seen": 11063280, + "step": 16960 + }, + { + "epoch": 10.002948113207546, + "grad_norm": 2.236830234527588, + "learning_rate": 5.8662139117921435e-06, + "loss": 0.3382, + "num_input_tokens_seen": 11066480, + "step": 16965 + }, + { + "epoch": 10.005896226415095, + "grad_norm": 2.191300630569458, + "learning_rate": 5.8636799847537785e-06, + "loss": 0.3696, + "num_input_tokens_seen": 11070448, + "step": 16970 + }, + { + "epoch": 10.008844339622641, + "grad_norm": 2.762096643447876, + "learning_rate": 5.861145829052415e-06, + "loss": 0.3161, + "num_input_tokens_seen": 11073232, + "step": 16975 + }, + { + "epoch": 10.01179245283019, + "grad_norm": 2.7102532386779785, + "learning_rate": 5.858611445358982e-06, + "loss": 0.201, + "num_input_tokens_seen": 11076016, + "step": 16980 + }, + { + "epoch": 10.014740566037736, + "grad_norm": 2.283935546875, + "learning_rate": 5.856076834344468e-06, + "loss": 0.3956, + "num_input_tokens_seen": 11079760, + "step": 16985 + }, + { + "epoch": 10.017688679245284, + "grad_norm": 3.5570545196533203, + "learning_rate": 5.853541996679924e-06, + "loss": 0.4209, + "num_input_tokens_seen": 11082608, + "step": 16990 + }, + { + "epoch": 10.02063679245283, + "grad_norm": 1.843102216720581, + "learning_rate": 5.851006933036456e-06, + "loss": 0.3841, + "num_input_tokens_seen": 11085616, + "step": 16995 + }, + { + "epoch": 10.023584905660377, + "grad_norm": 1.6436667442321777, + "learning_rate": 5.848471644085236e-06, + "loss": 0.433, + "num_input_tokens_seen": 11090096, + "step": 17000 + }, + { + "epoch": 10.026533018867925, + "grad_norm": 5.827091217041016, + "learning_rate": 5.84593613049749e-06, + "loss": 0.3709, + "num_input_tokens_seen": 11092848, + "step": 17005 + }, + { + "epoch": 10.029481132075471, + "grad_norm": 4.524348735809326, + "learning_rate": 5.843400392944509e-06, + "loss": 0.4307, + "num_input_tokens_seen": 11095792, + "step": 17010 + }, + { + "epoch": 10.03242924528302, + "grad_norm": 1.5599381923675537, + "learning_rate": 5.840864432097639e-06, + "loss": 0.4455, + "num_input_tokens_seen": 11100528, + "step": 17015 + }, + { + "epoch": 10.035377358490566, + "grad_norm": 3.4543261528015137, + "learning_rate": 5.838328248628286e-06, + "loss": 0.3511, + "num_input_tokens_seen": 11103824, + "step": 17020 + }, + { + "epoch": 10.038325471698114, + "grad_norm": 2.0398054122924805, + "learning_rate": 5.835791843207916e-06, + "loss": 0.4317, + "num_input_tokens_seen": 11107184, + "step": 17025 + }, + { + "epoch": 10.04127358490566, + "grad_norm": 3.838715076446533, + "learning_rate": 5.833255216508056e-06, + "loss": 0.4389, + "num_input_tokens_seen": 11114512, + "step": 17030 + }, + { + "epoch": 10.044221698113208, + "grad_norm": 2.4425644874572754, + "learning_rate": 5.830718369200284e-06, + "loss": 0.4194, + "num_input_tokens_seen": 11116912, + "step": 17035 + }, + { + "epoch": 10.047169811320755, + "grad_norm": 2.2100231647491455, + "learning_rate": 5.828181301956244e-06, + "loss": 0.3677, + "num_input_tokens_seen": 11120176, + "step": 17040 + }, + { + "epoch": 10.050117924528301, + "grad_norm": 3.5590505599975586, + "learning_rate": 5.825644015447636e-06, + "loss": 0.3028, + "num_input_tokens_seen": 11122896, + "step": 17045 + }, + { + "epoch": 10.05306603773585, + "grad_norm": 3.900620222091675, + "learning_rate": 5.823106510346216e-06, + "loss": 0.436, + "num_input_tokens_seen": 11126256, + "step": 17050 + }, + { + "epoch": 10.056014150943396, + "grad_norm": 2.0913264751434326, + "learning_rate": 5.820568787323798e-06, + "loss": 0.3686, + "num_input_tokens_seen": 11129040, + "step": 17055 + }, + { + "epoch": 10.058962264150944, + "grad_norm": 5.313127040863037, + "learning_rate": 5.818030847052258e-06, + "loss": 0.3068, + "num_input_tokens_seen": 11132016, + "step": 17060 + }, + { + "epoch": 10.06191037735849, + "grad_norm": 2.636955499649048, + "learning_rate": 5.815492690203528e-06, + "loss": 0.3379, + "num_input_tokens_seen": 11134864, + "step": 17065 + }, + { + "epoch": 10.064858490566039, + "grad_norm": 2.718418836593628, + "learning_rate": 5.812954317449591e-06, + "loss": 0.609, + "num_input_tokens_seen": 11138032, + "step": 17070 + }, + { + "epoch": 10.067806603773585, + "grad_norm": 2.9297893047332764, + "learning_rate": 5.810415729462496e-06, + "loss": 0.3786, + "num_input_tokens_seen": 11140944, + "step": 17075 + }, + { + "epoch": 10.070754716981131, + "grad_norm": 3.6357223987579346, + "learning_rate": 5.807876926914344e-06, + "loss": 0.3007, + "num_input_tokens_seen": 11144496, + "step": 17080 + }, + { + "epoch": 10.07370283018868, + "grad_norm": 3.955145835876465, + "learning_rate": 5.8053379104772935e-06, + "loss": 0.3227, + "num_input_tokens_seen": 11147440, + "step": 17085 + }, + { + "epoch": 10.076650943396226, + "grad_norm": 4.106929302215576, + "learning_rate": 5.802798680823562e-06, + "loss": 0.416, + "num_input_tokens_seen": 11150032, + "step": 17090 + }, + { + "epoch": 10.079599056603774, + "grad_norm": 2.101675033569336, + "learning_rate": 5.8002592386254186e-06, + "loss": 0.3803, + "num_input_tokens_seen": 11152912, + "step": 17095 + }, + { + "epoch": 10.08254716981132, + "grad_norm": 2.315084457397461, + "learning_rate": 5.797719584555192e-06, + "loss": 0.3678, + "num_input_tokens_seen": 11155920, + "step": 17100 + }, + { + "epoch": 10.085495283018869, + "grad_norm": 5.475011348724365, + "learning_rate": 5.795179719285269e-06, + "loss": 0.4083, + "num_input_tokens_seen": 11158896, + "step": 17105 + }, + { + "epoch": 10.088443396226415, + "grad_norm": 2.77616548538208, + "learning_rate": 5.792639643488086e-06, + "loss": 0.3617, + "num_input_tokens_seen": 11161424, + "step": 17110 + }, + { + "epoch": 10.091391509433961, + "grad_norm": 4.017038822174072, + "learning_rate": 5.7900993578361434e-06, + "loss": 0.3653, + "num_input_tokens_seen": 11164304, + "step": 17115 + }, + { + "epoch": 10.09433962264151, + "grad_norm": 1.4543569087982178, + "learning_rate": 5.7875588630019895e-06, + "loss": 0.3009, + "num_input_tokens_seen": 11168048, + "step": 17120 + }, + { + "epoch": 10.097287735849056, + "grad_norm": 3.10024094581604, + "learning_rate": 5.7850181596582335e-06, + "loss": 0.2757, + "num_input_tokens_seen": 11170288, + "step": 17125 + }, + { + "epoch": 10.100235849056604, + "grad_norm": 2.2524056434631348, + "learning_rate": 5.782477248477535e-06, + "loss": 0.441, + "num_input_tokens_seen": 11173296, + "step": 17130 + }, + { + "epoch": 10.10318396226415, + "grad_norm": 1.926977515220642, + "learning_rate": 5.779936130132614e-06, + "loss": 0.3755, + "num_input_tokens_seen": 11177072, + "step": 17135 + }, + { + "epoch": 10.106132075471699, + "grad_norm": 3.904543161392212, + "learning_rate": 5.777394805296242e-06, + "loss": 0.3886, + "num_input_tokens_seen": 11180720, + "step": 17140 + }, + { + "epoch": 10.109080188679245, + "grad_norm": 4.499124050140381, + "learning_rate": 5.774853274641243e-06, + "loss": 0.3377, + "num_input_tokens_seen": 11184144, + "step": 17145 + }, + { + "epoch": 10.112028301886792, + "grad_norm": 2.6898717880249023, + "learning_rate": 5.772311538840501e-06, + "loss": 0.3566, + "num_input_tokens_seen": 11186928, + "step": 17150 + }, + { + "epoch": 10.11497641509434, + "grad_norm": 2.7684848308563232, + "learning_rate": 5.76976959856695e-06, + "loss": 0.375, + "num_input_tokens_seen": 11189808, + "step": 17155 + }, + { + "epoch": 10.117924528301886, + "grad_norm": 4.081057071685791, + "learning_rate": 5.767227454493582e-06, + "loss": 0.3196, + "num_input_tokens_seen": 11192816, + "step": 17160 + }, + { + "epoch": 10.120872641509434, + "grad_norm": 2.6376779079437256, + "learning_rate": 5.764685107293436e-06, + "loss": 0.371, + "num_input_tokens_seen": 11196240, + "step": 17165 + }, + { + "epoch": 10.12382075471698, + "grad_norm": 1.6304056644439697, + "learning_rate": 5.762142557639614e-06, + "loss": 0.2972, + "num_input_tokens_seen": 11199888, + "step": 17170 + }, + { + "epoch": 10.126768867924529, + "grad_norm": 2.4335174560546875, + "learning_rate": 5.759599806205266e-06, + "loss": 0.3689, + "num_input_tokens_seen": 11202864, + "step": 17175 + }, + { + "epoch": 10.129716981132075, + "grad_norm": 1.5857563018798828, + "learning_rate": 5.757056853663594e-06, + "loss": 0.3085, + "num_input_tokens_seen": 11205840, + "step": 17180 + }, + { + "epoch": 10.132665094339623, + "grad_norm": 3.1204757690429688, + "learning_rate": 5.754513700687858e-06, + "loss": 0.3276, + "num_input_tokens_seen": 11208112, + "step": 17185 + }, + { + "epoch": 10.13561320754717, + "grad_norm": 3.415346145629883, + "learning_rate": 5.751970347951369e-06, + "loss": 0.2732, + "num_input_tokens_seen": 11210608, + "step": 17190 + }, + { + "epoch": 10.138561320754716, + "grad_norm": 2.639396905899048, + "learning_rate": 5.7494267961274875e-06, + "loss": 0.4057, + "num_input_tokens_seen": 11213872, + "step": 17195 + }, + { + "epoch": 10.141509433962264, + "grad_norm": 4.661757946014404, + "learning_rate": 5.746883045889633e-06, + "loss": 0.2573, + "num_input_tokens_seen": 11216848, + "step": 17200 + }, + { + "epoch": 10.14445754716981, + "grad_norm": 2.5506951808929443, + "learning_rate": 5.744339097911272e-06, + "loss": 0.3192, + "num_input_tokens_seen": 11219792, + "step": 17205 + }, + { + "epoch": 10.147405660377359, + "grad_norm": 3.494114637374878, + "learning_rate": 5.741794952865928e-06, + "loss": 0.3808, + "num_input_tokens_seen": 11222544, + "step": 17210 + }, + { + "epoch": 10.150353773584905, + "grad_norm": 2.6537415981292725, + "learning_rate": 5.739250611427173e-06, + "loss": 0.4416, + "num_input_tokens_seen": 11225808, + "step": 17215 + }, + { + "epoch": 10.153301886792454, + "grad_norm": 4.799945831298828, + "learning_rate": 5.736706074268633e-06, + "loss": 0.5447, + "num_input_tokens_seen": 11229776, + "step": 17220 + }, + { + "epoch": 10.15625, + "grad_norm": 3.400574207305908, + "learning_rate": 5.734161342063984e-06, + "loss": 0.3847, + "num_input_tokens_seen": 11232976, + "step": 17225 + }, + { + "epoch": 10.159198113207546, + "grad_norm": 1.9110426902770996, + "learning_rate": 5.731616415486957e-06, + "loss": 0.3187, + "num_input_tokens_seen": 11235984, + "step": 17230 + }, + { + "epoch": 10.162146226415095, + "grad_norm": 2.2381649017333984, + "learning_rate": 5.72907129521133e-06, + "loss": 0.454, + "num_input_tokens_seen": 11239472, + "step": 17235 + }, + { + "epoch": 10.165094339622641, + "grad_norm": 3.986616373062134, + "learning_rate": 5.726525981910935e-06, + "loss": 0.3823, + "num_input_tokens_seen": 11243312, + "step": 17240 + }, + { + "epoch": 10.16804245283019, + "grad_norm": 1.8503303527832031, + "learning_rate": 5.723980476259658e-06, + "loss": 0.3964, + "num_input_tokens_seen": 11246736, + "step": 17245 + }, + { + "epoch": 10.170990566037736, + "grad_norm": 2.1283187866210938, + "learning_rate": 5.721434778931426e-06, + "loss": 0.4034, + "num_input_tokens_seen": 11249488, + "step": 17250 + }, + { + "epoch": 10.173938679245284, + "grad_norm": 2.6528496742248535, + "learning_rate": 5.7188888906002284e-06, + "loss": 0.3016, + "num_input_tokens_seen": 11253328, + "step": 17255 + }, + { + "epoch": 10.17688679245283, + "grad_norm": 2.4719715118408203, + "learning_rate": 5.716342811940098e-06, + "loss": 0.3584, + "num_input_tokens_seen": 11256944, + "step": 17260 + }, + { + "epoch": 10.179834905660377, + "grad_norm": 1.5065561532974243, + "learning_rate": 5.713796543625123e-06, + "loss": 0.3687, + "num_input_tokens_seen": 11260240, + "step": 17265 + }, + { + "epoch": 10.182783018867925, + "grad_norm": 2.675222635269165, + "learning_rate": 5.711250086329435e-06, + "loss": 0.2955, + "num_input_tokens_seen": 11262864, + "step": 17270 + }, + { + "epoch": 10.185731132075471, + "grad_norm": 3.1739578247070312, + "learning_rate": 5.7087034407272225e-06, + "loss": 0.3456, + "num_input_tokens_seen": 11265744, + "step": 17275 + }, + { + "epoch": 10.18867924528302, + "grad_norm": 4.033857822418213, + "learning_rate": 5.70615660749272e-06, + "loss": 0.3833, + "num_input_tokens_seen": 11268528, + "step": 17280 + }, + { + "epoch": 10.191627358490566, + "grad_norm": 6.158642768859863, + "learning_rate": 5.7036095873002106e-06, + "loss": 0.5305, + "num_input_tokens_seen": 11271824, + "step": 17285 + }, + { + "epoch": 10.194575471698114, + "grad_norm": 2.364011526107788, + "learning_rate": 5.701062380824032e-06, + "loss": 0.2883, + "num_input_tokens_seen": 11275216, + "step": 17290 + }, + { + "epoch": 10.19752358490566, + "grad_norm": 2.7802627086639404, + "learning_rate": 5.698514988738566e-06, + "loss": 0.3354, + "num_input_tokens_seen": 11278064, + "step": 17295 + }, + { + "epoch": 10.200471698113208, + "grad_norm": 2.436126947402954, + "learning_rate": 5.6959674117182465e-06, + "loss": 0.421, + "num_input_tokens_seen": 11282736, + "step": 17300 + }, + { + "epoch": 10.203419811320755, + "grad_norm": 4.761617660522461, + "learning_rate": 5.693419650437554e-06, + "loss": 0.402, + "num_input_tokens_seen": 11285200, + "step": 17305 + }, + { + "epoch": 10.206367924528301, + "grad_norm": 3.340930700302124, + "learning_rate": 5.690871705571022e-06, + "loss": 0.3823, + "num_input_tokens_seen": 11287376, + "step": 17310 + }, + { + "epoch": 10.20931603773585, + "grad_norm": 3.365468978881836, + "learning_rate": 5.688323577793229e-06, + "loss": 0.4477, + "num_input_tokens_seen": 11290288, + "step": 17315 + }, + { + "epoch": 10.212264150943396, + "grad_norm": 2.726369619369507, + "learning_rate": 5.685775267778801e-06, + "loss": 0.4168, + "num_input_tokens_seen": 11292976, + "step": 17320 + }, + { + "epoch": 10.215212264150944, + "grad_norm": 5.126517295837402, + "learning_rate": 5.683226776202416e-06, + "loss": 0.4327, + "num_input_tokens_seen": 11296656, + "step": 17325 + }, + { + "epoch": 10.21816037735849, + "grad_norm": 3.199646234512329, + "learning_rate": 5.680678103738798e-06, + "loss": 0.48, + "num_input_tokens_seen": 11299760, + "step": 17330 + }, + { + "epoch": 10.221108490566039, + "grad_norm": 4.901650905609131, + "learning_rate": 5.678129251062717e-06, + "loss": 0.3417, + "num_input_tokens_seen": 11302736, + "step": 17335 + }, + { + "epoch": 10.224056603773585, + "grad_norm": 1.9152823686599731, + "learning_rate": 5.675580218848995e-06, + "loss": 0.3446, + "num_input_tokens_seen": 11305968, + "step": 17340 + }, + { + "epoch": 10.227004716981131, + "grad_norm": 4.549933910369873, + "learning_rate": 5.673031007772498e-06, + "loss": 0.3756, + "num_input_tokens_seen": 11309040, + "step": 17345 + }, + { + "epoch": 10.22995283018868, + "grad_norm": 3.1808485984802246, + "learning_rate": 5.670481618508141e-06, + "loss": 0.3604, + "num_input_tokens_seen": 11311760, + "step": 17350 + }, + { + "epoch": 10.232900943396226, + "grad_norm": 5.211045265197754, + "learning_rate": 5.667932051730887e-06, + "loss": 0.341, + "num_input_tokens_seen": 11314704, + "step": 17355 + }, + { + "epoch": 10.235849056603774, + "grad_norm": 1.9566785097122192, + "learning_rate": 5.6653823081157434e-06, + "loss": 0.3543, + "num_input_tokens_seen": 11317808, + "step": 17360 + }, + { + "epoch": 10.23879716981132, + "grad_norm": 3.3826732635498047, + "learning_rate": 5.662832388337766e-06, + "loss": 0.397, + "num_input_tokens_seen": 11320816, + "step": 17365 + }, + { + "epoch": 10.241745283018869, + "grad_norm": 2.209038257598877, + "learning_rate": 5.660282293072057e-06, + "loss": 0.3498, + "num_input_tokens_seen": 11325232, + "step": 17370 + }, + { + "epoch": 10.244693396226415, + "grad_norm": 2.366002082824707, + "learning_rate": 5.657732022993765e-06, + "loss": 0.3768, + "num_input_tokens_seen": 11328368, + "step": 17375 + }, + { + "epoch": 10.247641509433961, + "grad_norm": 5.412224292755127, + "learning_rate": 5.655181578778085e-06, + "loss": 0.4547, + "num_input_tokens_seen": 11331312, + "step": 17380 + }, + { + "epoch": 10.25058962264151, + "grad_norm": 2.7893948554992676, + "learning_rate": 5.65263096110026e-06, + "loss": 0.2458, + "num_input_tokens_seen": 11334032, + "step": 17385 + }, + { + "epoch": 10.253537735849056, + "grad_norm": 2.4446887969970703, + "learning_rate": 5.650080170635573e-06, + "loss": 0.3472, + "num_input_tokens_seen": 11337328, + "step": 17390 + }, + { + "epoch": 10.256485849056604, + "grad_norm": 1.897607445716858, + "learning_rate": 5.647529208059359e-06, + "loss": 0.3258, + "num_input_tokens_seen": 11340240, + "step": 17395 + }, + { + "epoch": 10.25943396226415, + "grad_norm": 1.8987586498260498, + "learning_rate": 5.6449780740469985e-06, + "loss": 0.4787, + "num_input_tokens_seen": 11343536, + "step": 17400 + }, + { + "epoch": 10.262382075471699, + "grad_norm": 1.4581321477890015, + "learning_rate": 5.642426769273912e-06, + "loss": 0.3464, + "num_input_tokens_seen": 11346544, + "step": 17405 + }, + { + "epoch": 10.265330188679245, + "grad_norm": 2.3183844089508057, + "learning_rate": 5.63987529441557e-06, + "loss": 0.2924, + "num_input_tokens_seen": 11350096, + "step": 17410 + }, + { + "epoch": 10.268278301886792, + "grad_norm": 1.7459776401519775, + "learning_rate": 5.637323650147487e-06, + "loss": 0.4884, + "num_input_tokens_seen": 11352688, + "step": 17415 + }, + { + "epoch": 10.27122641509434, + "grad_norm": 2.4574997425079346, + "learning_rate": 5.63477183714522e-06, + "loss": 0.281, + "num_input_tokens_seen": 11355824, + "step": 17420 + }, + { + "epoch": 10.274174528301886, + "grad_norm": 3.4797680377960205, + "learning_rate": 5.632219856084373e-06, + "loss": 0.4866, + "num_input_tokens_seen": 11359152, + "step": 17425 + }, + { + "epoch": 10.277122641509434, + "grad_norm": 1.973578929901123, + "learning_rate": 5.6296677076405944e-06, + "loss": 0.4205, + "num_input_tokens_seen": 11362320, + "step": 17430 + }, + { + "epoch": 10.28007075471698, + "grad_norm": 2.9002130031585693, + "learning_rate": 5.627115392489578e-06, + "loss": 0.3303, + "num_input_tokens_seen": 11366608, + "step": 17435 + }, + { + "epoch": 10.283018867924529, + "grad_norm": 4.200732231140137, + "learning_rate": 5.624562911307058e-06, + "loss": 0.3174, + "num_input_tokens_seen": 11369616, + "step": 17440 + }, + { + "epoch": 10.285966981132075, + "grad_norm": 2.2556450366973877, + "learning_rate": 5.622010264768813e-06, + "loss": 0.4381, + "num_input_tokens_seen": 11372624, + "step": 17445 + }, + { + "epoch": 10.288915094339623, + "grad_norm": 2.140825033187866, + "learning_rate": 5.619457453550673e-06, + "loss": 0.2626, + "num_input_tokens_seen": 11376144, + "step": 17450 + }, + { + "epoch": 10.29186320754717, + "grad_norm": 2.6157894134521484, + "learning_rate": 5.616904478328502e-06, + "loss": 0.4153, + "num_input_tokens_seen": 11379216, + "step": 17455 + }, + { + "epoch": 10.294811320754716, + "grad_norm": 2.5347180366516113, + "learning_rate": 5.6143513397782104e-06, + "loss": 0.4119, + "num_input_tokens_seen": 11382032, + "step": 17460 + }, + { + "epoch": 10.297759433962264, + "grad_norm": 5.237630844116211, + "learning_rate": 5.611798038575755e-06, + "loss": 0.4468, + "num_input_tokens_seen": 11384752, + "step": 17465 + }, + { + "epoch": 10.30070754716981, + "grad_norm": 2.3017730712890625, + "learning_rate": 5.609244575397131e-06, + "loss": 0.2419, + "num_input_tokens_seen": 11388048, + "step": 17470 + }, + { + "epoch": 10.303655660377359, + "grad_norm": 2.804633617401123, + "learning_rate": 5.606690950918381e-06, + "loss": 0.3855, + "num_input_tokens_seen": 11391152, + "step": 17475 + }, + { + "epoch": 10.306603773584905, + "grad_norm": 2.553508996963501, + "learning_rate": 5.604137165815586e-06, + "loss": 0.3195, + "num_input_tokens_seen": 11394864, + "step": 17480 + }, + { + "epoch": 10.309551886792454, + "grad_norm": 4.214234828948975, + "learning_rate": 5.601583220764874e-06, + "loss": 0.3805, + "num_input_tokens_seen": 11399984, + "step": 17485 + }, + { + "epoch": 10.3125, + "grad_norm": 2.570779323577881, + "learning_rate": 5.599029116442409e-06, + "loss": 0.3897, + "num_input_tokens_seen": 11402864, + "step": 17490 + }, + { + "epoch": 10.315448113207546, + "grad_norm": 3.488511323928833, + "learning_rate": 5.596474853524406e-06, + "loss": 0.3776, + "num_input_tokens_seen": 11405232, + "step": 17495 + }, + { + "epoch": 10.318396226415095, + "grad_norm": 4.43650484085083, + "learning_rate": 5.593920432687115e-06, + "loss": 0.5069, + "num_input_tokens_seen": 11408048, + "step": 17500 + }, + { + "epoch": 10.321344339622641, + "grad_norm": 3.2700893878936768, + "learning_rate": 5.591365854606829e-06, + "loss": 0.3743, + "num_input_tokens_seen": 11412016, + "step": 17505 + }, + { + "epoch": 10.32429245283019, + "grad_norm": 2.827975034713745, + "learning_rate": 5.588811119959885e-06, + "loss": 0.3291, + "num_input_tokens_seen": 11414960, + "step": 17510 + }, + { + "epoch": 10.327240566037736, + "grad_norm": 2.1097817420959473, + "learning_rate": 5.58625622942266e-06, + "loss": 0.684, + "num_input_tokens_seen": 11417840, + "step": 17515 + }, + { + "epoch": 10.330188679245284, + "grad_norm": 2.6729228496551514, + "learning_rate": 5.58370118367157e-06, + "loss": 0.2969, + "num_input_tokens_seen": 11421360, + "step": 17520 + }, + { + "epoch": 10.33313679245283, + "grad_norm": 2.0294246673583984, + "learning_rate": 5.581145983383077e-06, + "loss": 0.4068, + "num_input_tokens_seen": 11424368, + "step": 17525 + }, + { + "epoch": 10.336084905660377, + "grad_norm": 2.1000876426696777, + "learning_rate": 5.57859062923368e-06, + "loss": 0.2624, + "num_input_tokens_seen": 11427056, + "step": 17530 + }, + { + "epoch": 10.339033018867925, + "grad_norm": 1.9153977632522583, + "learning_rate": 5.5760351218999194e-06, + "loss": 0.3341, + "num_input_tokens_seen": 11431152, + "step": 17535 + }, + { + "epoch": 10.341981132075471, + "grad_norm": 3.605034828186035, + "learning_rate": 5.573479462058379e-06, + "loss": 0.4114, + "num_input_tokens_seen": 11433680, + "step": 17540 + }, + { + "epoch": 10.34492924528302, + "grad_norm": 2.6693365573883057, + "learning_rate": 5.570923650385679e-06, + "loss": 0.3669, + "num_input_tokens_seen": 11436208, + "step": 17545 + }, + { + "epoch": 10.347877358490566, + "grad_norm": 3.5500359535217285, + "learning_rate": 5.568367687558481e-06, + "loss": 0.4004, + "num_input_tokens_seen": 11438384, + "step": 17550 + }, + { + "epoch": 10.350825471698114, + "grad_norm": 5.69774055480957, + "learning_rate": 5.56581157425349e-06, + "loss": 0.4195, + "num_input_tokens_seen": 11440976, + "step": 17555 + }, + { + "epoch": 10.35377358490566, + "grad_norm": 2.398529291152954, + "learning_rate": 5.563255311147446e-06, + "loss": 0.3413, + "num_input_tokens_seen": 11444816, + "step": 17560 + }, + { + "epoch": 10.356721698113208, + "grad_norm": 4.133174419403076, + "learning_rate": 5.560698898917129e-06, + "loss": 0.3753, + "num_input_tokens_seen": 11447376, + "step": 17565 + }, + { + "epoch": 10.359669811320755, + "grad_norm": 2.2507314682006836, + "learning_rate": 5.558142338239365e-06, + "loss": 0.3708, + "num_input_tokens_seen": 11450896, + "step": 17570 + }, + { + "epoch": 10.362617924528301, + "grad_norm": 1.8855146169662476, + "learning_rate": 5.55558562979101e-06, + "loss": 0.4119, + "num_input_tokens_seen": 11453584, + "step": 17575 + }, + { + "epoch": 10.36556603773585, + "grad_norm": 3.634397506713867, + "learning_rate": 5.553028774248964e-06, + "loss": 0.4158, + "num_input_tokens_seen": 11456272, + "step": 17580 + }, + { + "epoch": 10.368514150943396, + "grad_norm": 3.053724765777588, + "learning_rate": 5.5504717722901665e-06, + "loss": 0.3593, + "num_input_tokens_seen": 11459216, + "step": 17585 + }, + { + "epoch": 10.371462264150944, + "grad_norm": 2.730050802230835, + "learning_rate": 5.547914624591597e-06, + "loss": 0.5275, + "num_input_tokens_seen": 11462160, + "step": 17590 + }, + { + "epoch": 10.37441037735849, + "grad_norm": 3.473198890686035, + "learning_rate": 5.545357331830269e-06, + "loss": 0.4625, + "num_input_tokens_seen": 11465488, + "step": 17595 + }, + { + "epoch": 10.377358490566039, + "grad_norm": 2.4932329654693604, + "learning_rate": 5.542799894683235e-06, + "loss": 0.3093, + "num_input_tokens_seen": 11467792, + "step": 17600 + }, + { + "epoch": 10.380306603773585, + "grad_norm": 3.3443124294281006, + "learning_rate": 5.540242313827591e-06, + "loss": 0.3896, + "num_input_tokens_seen": 11470896, + "step": 17605 + }, + { + "epoch": 10.383254716981131, + "grad_norm": 8.577765464782715, + "learning_rate": 5.537684589940466e-06, + "loss": 0.3657, + "num_input_tokens_seen": 11473648, + "step": 17610 + }, + { + "epoch": 10.38620283018868, + "grad_norm": 5.007880687713623, + "learning_rate": 5.535126723699029e-06, + "loss": 0.4337, + "num_input_tokens_seen": 11477136, + "step": 17615 + }, + { + "epoch": 10.389150943396226, + "grad_norm": 3.1569900512695312, + "learning_rate": 5.532568715780485e-06, + "loss": 0.3108, + "num_input_tokens_seen": 11480080, + "step": 17620 + }, + { + "epoch": 10.392099056603774, + "grad_norm": 2.7042129039764404, + "learning_rate": 5.530010566862077e-06, + "loss": 0.4411, + "num_input_tokens_seen": 11482992, + "step": 17625 + }, + { + "epoch": 10.39504716981132, + "grad_norm": 2.844331741333008, + "learning_rate": 5.527452277621089e-06, + "loss": 0.3397, + "num_input_tokens_seen": 11486512, + "step": 17630 + }, + { + "epoch": 10.397995283018869, + "grad_norm": 2.6111152172088623, + "learning_rate": 5.524893848734837e-06, + "loss": 0.4316, + "num_input_tokens_seen": 11490448, + "step": 17635 + }, + { + "epoch": 10.400943396226415, + "grad_norm": 2.7033448219299316, + "learning_rate": 5.522335280880676e-06, + "loss": 0.4368, + "num_input_tokens_seen": 11493520, + "step": 17640 + }, + { + "epoch": 10.403891509433961, + "grad_norm": 3.7705066204071045, + "learning_rate": 5.519776574735999e-06, + "loss": 0.3717, + "num_input_tokens_seen": 11497232, + "step": 17645 + }, + { + "epoch": 10.40683962264151, + "grad_norm": 2.8128511905670166, + "learning_rate": 5.5172177309782325e-06, + "loss": 0.3409, + "num_input_tokens_seen": 11501072, + "step": 17650 + }, + { + "epoch": 10.409787735849056, + "grad_norm": 3.1056880950927734, + "learning_rate": 5.514658750284844e-06, + "loss": 0.4006, + "num_input_tokens_seen": 11504880, + "step": 17655 + }, + { + "epoch": 10.412735849056604, + "grad_norm": 2.853879451751709, + "learning_rate": 5.512099633333332e-06, + "loss": 0.317, + "num_input_tokens_seen": 11508336, + "step": 17660 + }, + { + "epoch": 10.41568396226415, + "grad_norm": 4.7368974685668945, + "learning_rate": 5.509540380801236e-06, + "loss": 0.4241, + "num_input_tokens_seen": 11511344, + "step": 17665 + }, + { + "epoch": 10.418632075471699, + "grad_norm": 4.0707902908325195, + "learning_rate": 5.506980993366129e-06, + "loss": 0.3733, + "num_input_tokens_seen": 11515984, + "step": 17670 + }, + { + "epoch": 10.421580188679245, + "grad_norm": 2.349018096923828, + "learning_rate": 5.504421471705616e-06, + "loss": 0.4111, + "num_input_tokens_seen": 11518992, + "step": 17675 + }, + { + "epoch": 10.424528301886792, + "grad_norm": 6.048584461212158, + "learning_rate": 5.501861816497346e-06, + "loss": 0.4107, + "num_input_tokens_seen": 11523312, + "step": 17680 + }, + { + "epoch": 10.42747641509434, + "grad_norm": 2.32818865776062, + "learning_rate": 5.499302028418998e-06, + "loss": 0.3727, + "num_input_tokens_seen": 11526960, + "step": 17685 + }, + { + "epoch": 10.430424528301886, + "grad_norm": 1.6205062866210938, + "learning_rate": 5.496742108148285e-06, + "loss": 0.4616, + "num_input_tokens_seen": 11530096, + "step": 17690 + }, + { + "epoch": 10.433372641509434, + "grad_norm": 3.2586190700531006, + "learning_rate": 5.494182056362959e-06, + "loss": 0.308, + "num_input_tokens_seen": 11533328, + "step": 17695 + }, + { + "epoch": 10.43632075471698, + "grad_norm": 3.2934730052948, + "learning_rate": 5.491621873740804e-06, + "loss": 0.4698, + "num_input_tokens_seen": 11536336, + "step": 17700 + }, + { + "epoch": 10.439268867924529, + "grad_norm": 1.8386735916137695, + "learning_rate": 5.4890615609596375e-06, + "loss": 0.5364, + "num_input_tokens_seen": 11538928, + "step": 17705 + }, + { + "epoch": 10.442216981132075, + "grad_norm": 2.0543055534362793, + "learning_rate": 5.486501118697317e-06, + "loss": 0.3848, + "num_input_tokens_seen": 11541520, + "step": 17710 + }, + { + "epoch": 10.445165094339623, + "grad_norm": 1.2610985040664673, + "learning_rate": 5.483940547631727e-06, + "loss": 0.3601, + "num_input_tokens_seen": 11544976, + "step": 17715 + }, + { + "epoch": 10.44811320754717, + "grad_norm": 1.874112844467163, + "learning_rate": 5.481379848440792e-06, + "loss": 0.5124, + "num_input_tokens_seen": 11549072, + "step": 17720 + }, + { + "epoch": 10.451061320754716, + "grad_norm": 1.7469552755355835, + "learning_rate": 5.478819021802468e-06, + "loss": 0.3794, + "num_input_tokens_seen": 11552080, + "step": 17725 + }, + { + "epoch": 10.454009433962264, + "grad_norm": 3.6463067531585693, + "learning_rate": 5.476258068394743e-06, + "loss": 0.2933, + "num_input_tokens_seen": 11555920, + "step": 17730 + }, + { + "epoch": 10.45695754716981, + "grad_norm": 2.2082536220550537, + "learning_rate": 5.473696988895644e-06, + "loss": 0.2379, + "num_input_tokens_seen": 11560112, + "step": 17735 + }, + { + "epoch": 10.459905660377359, + "grad_norm": 3.2026994228363037, + "learning_rate": 5.471135783983224e-06, + "loss": 0.3868, + "num_input_tokens_seen": 11563184, + "step": 17740 + }, + { + "epoch": 10.462853773584905, + "grad_norm": 2.426938056945801, + "learning_rate": 5.4685744543355745e-06, + "loss": 0.341, + "num_input_tokens_seen": 11566384, + "step": 17745 + }, + { + "epoch": 10.465801886792454, + "grad_norm": 2.6454083919525146, + "learning_rate": 5.466013000630819e-06, + "loss": 0.3955, + "num_input_tokens_seen": 11569392, + "step": 17750 + }, + { + "epoch": 10.46875, + "grad_norm": 1.6985721588134766, + "learning_rate": 5.463451423547114e-06, + "loss": 0.3669, + "num_input_tokens_seen": 11573264, + "step": 17755 + }, + { + "epoch": 10.471698113207546, + "grad_norm": 1.9474225044250488, + "learning_rate": 5.460889723762647e-06, + "loss": 0.4188, + "num_input_tokens_seen": 11576816, + "step": 17760 + }, + { + "epoch": 10.474646226415095, + "grad_norm": 3.021440267562866, + "learning_rate": 5.458327901955639e-06, + "loss": 0.3481, + "num_input_tokens_seen": 11580656, + "step": 17765 + }, + { + "epoch": 10.477594339622641, + "grad_norm": 2.984830379486084, + "learning_rate": 5.455765958804344e-06, + "loss": 0.3316, + "num_input_tokens_seen": 11583504, + "step": 17770 + }, + { + "epoch": 10.48054245283019, + "grad_norm": 1.9020577669143677, + "learning_rate": 5.4532038949870455e-06, + "loss": 0.3063, + "num_input_tokens_seen": 11587056, + "step": 17775 + }, + { + "epoch": 10.483490566037736, + "grad_norm": 1.5156751871109009, + "learning_rate": 5.450641711182066e-06, + "loss": 0.4263, + "num_input_tokens_seen": 11590448, + "step": 17780 + }, + { + "epoch": 10.486438679245284, + "grad_norm": 3.9102132320404053, + "learning_rate": 5.448079408067748e-06, + "loss": 0.3105, + "num_input_tokens_seen": 11593200, + "step": 17785 + }, + { + "epoch": 10.48938679245283, + "grad_norm": 2.7179441452026367, + "learning_rate": 5.4455169863224775e-06, + "loss": 0.3819, + "num_input_tokens_seen": 11596880, + "step": 17790 + }, + { + "epoch": 10.492334905660377, + "grad_norm": 2.3332467079162598, + "learning_rate": 5.442954446624664e-06, + "loss": 0.4312, + "num_input_tokens_seen": 11599632, + "step": 17795 + }, + { + "epoch": 10.495283018867925, + "grad_norm": 4.443470478057861, + "learning_rate": 5.440391789652752e-06, + "loss": 0.5003, + "num_input_tokens_seen": 11602416, + "step": 17800 + }, + { + "epoch": 10.498231132075471, + "grad_norm": 6.064513683319092, + "learning_rate": 5.437829016085216e-06, + "loss": 0.271, + "num_input_tokens_seen": 11605008, + "step": 17805 + }, + { + "epoch": 10.50117924528302, + "grad_norm": 3.5286128520965576, + "learning_rate": 5.435266126600561e-06, + "loss": 0.3225, + "num_input_tokens_seen": 11607856, + "step": 17810 + }, + { + "epoch": 10.504127358490566, + "grad_norm": 1.9397250413894653, + "learning_rate": 5.4327031218773215e-06, + "loss": 0.4079, + "num_input_tokens_seen": 11611184, + "step": 17815 + }, + { + "epoch": 10.507075471698114, + "grad_norm": 2.8401334285736084, + "learning_rate": 5.430140002594067e-06, + "loss": 0.3475, + "num_input_tokens_seen": 11614704, + "step": 17820 + }, + { + "epoch": 10.51002358490566, + "grad_norm": 6.736853122711182, + "learning_rate": 5.4275767694293934e-06, + "loss": 0.378, + "num_input_tokens_seen": 11617584, + "step": 17825 + }, + { + "epoch": 10.512971698113208, + "grad_norm": 2.2772741317749023, + "learning_rate": 5.425013423061926e-06, + "loss": 0.4474, + "num_input_tokens_seen": 11620464, + "step": 17830 + }, + { + "epoch": 10.515919811320755, + "grad_norm": 4.8379902839660645, + "learning_rate": 5.422449964170324e-06, + "loss": 0.2538, + "num_input_tokens_seen": 11623696, + "step": 17835 + }, + { + "epoch": 10.518867924528301, + "grad_norm": 3.0294594764709473, + "learning_rate": 5.419886393433275e-06, + "loss": 0.397, + "num_input_tokens_seen": 11626928, + "step": 17840 + }, + { + "epoch": 10.52181603773585, + "grad_norm": 1.425459861755371, + "learning_rate": 5.417322711529491e-06, + "loss": 0.3661, + "num_input_tokens_seen": 11629808, + "step": 17845 + }, + { + "epoch": 10.524764150943396, + "grad_norm": 2.107189655303955, + "learning_rate": 5.4147589191377224e-06, + "loss": 0.3338, + "num_input_tokens_seen": 11633584, + "step": 17850 + }, + { + "epoch": 10.527712264150944, + "grad_norm": 2.7019550800323486, + "learning_rate": 5.412195016936742e-06, + "loss": 0.5934, + "num_input_tokens_seen": 11636816, + "step": 17855 + }, + { + "epoch": 10.53066037735849, + "grad_norm": 2.7533226013183594, + "learning_rate": 5.409631005605354e-06, + "loss": 0.4128, + "num_input_tokens_seen": 11640048, + "step": 17860 + }, + { + "epoch": 10.533608490566039, + "grad_norm": 2.909158229827881, + "learning_rate": 5.407066885822391e-06, + "loss": 0.2739, + "num_input_tokens_seen": 11643184, + "step": 17865 + }, + { + "epoch": 10.536556603773585, + "grad_norm": 2.6675121784210205, + "learning_rate": 5.404502658266717e-06, + "loss": 0.3957, + "num_input_tokens_seen": 11646608, + "step": 17870 + }, + { + "epoch": 10.539504716981131, + "grad_norm": 3.662813663482666, + "learning_rate": 5.4019383236172195e-06, + "loss": 0.4263, + "num_input_tokens_seen": 11652528, + "step": 17875 + }, + { + "epoch": 10.54245283018868, + "grad_norm": 1.8791899681091309, + "learning_rate": 5.39937388255282e-06, + "loss": 0.4047, + "num_input_tokens_seen": 11656016, + "step": 17880 + }, + { + "epoch": 10.545400943396226, + "grad_norm": 1.9273725748062134, + "learning_rate": 5.3968093357524645e-06, + "loss": 0.3189, + "num_input_tokens_seen": 11659152, + "step": 17885 + }, + { + "epoch": 10.548349056603774, + "grad_norm": 2.5352768898010254, + "learning_rate": 5.3942446838951245e-06, + "loss": 0.3136, + "num_input_tokens_seen": 11662416, + "step": 17890 + }, + { + "epoch": 10.55129716981132, + "grad_norm": 4.709670543670654, + "learning_rate": 5.3916799276598074e-06, + "loss": 0.4149, + "num_input_tokens_seen": 11666288, + "step": 17895 + }, + { + "epoch": 10.554245283018869, + "grad_norm": 2.5742149353027344, + "learning_rate": 5.3891150677255425e-06, + "loss": 0.3559, + "num_input_tokens_seen": 11669648, + "step": 17900 + }, + { + "epoch": 10.557193396226415, + "grad_norm": 2.2333507537841797, + "learning_rate": 5.386550104771384e-06, + "loss": 0.4641, + "num_input_tokens_seen": 11673808, + "step": 17905 + }, + { + "epoch": 10.560141509433961, + "grad_norm": 3.6738831996917725, + "learning_rate": 5.3839850394764205e-06, + "loss": 0.3883, + "num_input_tokens_seen": 11677136, + "step": 17910 + }, + { + "epoch": 10.56308962264151, + "grad_norm": 1.8079829216003418, + "learning_rate": 5.381419872519763e-06, + "loss": 0.307, + "num_input_tokens_seen": 11679984, + "step": 17915 + }, + { + "epoch": 10.566037735849056, + "grad_norm": 2.831439733505249, + "learning_rate": 5.378854604580549e-06, + "loss": 0.273, + "num_input_tokens_seen": 11682992, + "step": 17920 + }, + { + "epoch": 10.568985849056604, + "grad_norm": 2.5175082683563232, + "learning_rate": 5.376289236337946e-06, + "loss": 0.3645, + "num_input_tokens_seen": 11687024, + "step": 17925 + }, + { + "epoch": 10.57193396226415, + "grad_norm": 1.7944520711898804, + "learning_rate": 5.373723768471147e-06, + "loss": 0.3354, + "num_input_tokens_seen": 11690160, + "step": 17930 + }, + { + "epoch": 10.574882075471699, + "grad_norm": 2.9446234703063965, + "learning_rate": 5.37115820165937e-06, + "loss": 0.3308, + "num_input_tokens_seen": 11694256, + "step": 17935 + }, + { + "epoch": 10.577830188679245, + "grad_norm": 3.007157802581787, + "learning_rate": 5.368592536581858e-06, + "loss": 0.2975, + "num_input_tokens_seen": 11697808, + "step": 17940 + }, + { + "epoch": 10.580778301886792, + "grad_norm": 3.278438091278076, + "learning_rate": 5.366026773917885e-06, + "loss": 0.337, + "num_input_tokens_seen": 11701904, + "step": 17945 + }, + { + "epoch": 10.58372641509434, + "grad_norm": 2.635357618331909, + "learning_rate": 5.363460914346746e-06, + "loss": 0.325, + "num_input_tokens_seen": 11704464, + "step": 17950 + }, + { + "epoch": 10.586674528301886, + "grad_norm": 3.6010220050811768, + "learning_rate": 5.360894958547762e-06, + "loss": 0.3533, + "num_input_tokens_seen": 11707408, + "step": 17955 + }, + { + "epoch": 10.589622641509434, + "grad_norm": 2.788458824157715, + "learning_rate": 5.358328907200284e-06, + "loss": 0.3972, + "num_input_tokens_seen": 11710544, + "step": 17960 + }, + { + "epoch": 10.59257075471698, + "grad_norm": 1.6266142129898071, + "learning_rate": 5.355762760983682e-06, + "loss": 0.2769, + "num_input_tokens_seen": 11714768, + "step": 17965 + }, + { + "epoch": 10.595518867924529, + "grad_norm": 2.9958503246307373, + "learning_rate": 5.353196520577356e-06, + "loss": 0.4167, + "num_input_tokens_seen": 11718256, + "step": 17970 + }, + { + "epoch": 10.598466981132075, + "grad_norm": 2.616300106048584, + "learning_rate": 5.35063018666073e-06, + "loss": 0.4275, + "num_input_tokens_seen": 11721040, + "step": 17975 + }, + { + "epoch": 10.601415094339622, + "grad_norm": 2.9427073001861572, + "learning_rate": 5.3480637599132515e-06, + "loss": 0.4001, + "num_input_tokens_seen": 11724592, + "step": 17980 + }, + { + "epoch": 10.60436320754717, + "grad_norm": 2.1944713592529297, + "learning_rate": 5.34549724101439e-06, + "loss": 0.3692, + "num_input_tokens_seen": 11729200, + "step": 17985 + }, + { + "epoch": 10.607311320754716, + "grad_norm": 3.8543853759765625, + "learning_rate": 5.342930630643646e-06, + "loss": 0.4728, + "num_input_tokens_seen": 11732368, + "step": 17990 + }, + { + "epoch": 10.610259433962264, + "grad_norm": 1.748189091682434, + "learning_rate": 5.340363929480541e-06, + "loss": 0.3403, + "num_input_tokens_seen": 11735856, + "step": 17995 + }, + { + "epoch": 10.61320754716981, + "grad_norm": 2.9466164112091064, + "learning_rate": 5.3377971382046164e-06, + "loss": 0.4121, + "num_input_tokens_seen": 11739184, + "step": 18000 + }, + { + "epoch": 10.616155660377359, + "grad_norm": 4.3717427253723145, + "learning_rate": 5.335230257495446e-06, + "loss": 0.4749, + "num_input_tokens_seen": 11741680, + "step": 18005 + }, + { + "epoch": 10.619103773584905, + "grad_norm": 1.578360676765442, + "learning_rate": 5.3326632880326205e-06, + "loss": 0.3491, + "num_input_tokens_seen": 11745616, + "step": 18010 + }, + { + "epoch": 10.622051886792454, + "grad_norm": 3.349447727203369, + "learning_rate": 5.3300962304957515e-06, + "loss": 0.3061, + "num_input_tokens_seen": 11748528, + "step": 18015 + }, + { + "epoch": 10.625, + "grad_norm": 3.4133248329162598, + "learning_rate": 5.327529085564487e-06, + "loss": 0.4662, + "num_input_tokens_seen": 11751056, + "step": 18020 + }, + { + "epoch": 10.627948113207546, + "grad_norm": 2.131540060043335, + "learning_rate": 5.324961853918485e-06, + "loss": 0.3393, + "num_input_tokens_seen": 11754864, + "step": 18025 + }, + { + "epoch": 10.630896226415095, + "grad_norm": 2.8473474979400635, + "learning_rate": 5.32239453623743e-06, + "loss": 0.371, + "num_input_tokens_seen": 11758256, + "step": 18030 + }, + { + "epoch": 10.633844339622641, + "grad_norm": 4.949124813079834, + "learning_rate": 5.3198271332010335e-06, + "loss": 0.3751, + "num_input_tokens_seen": 11762096, + "step": 18035 + }, + { + "epoch": 10.63679245283019, + "grad_norm": 3.5244696140289307, + "learning_rate": 5.317259645489024e-06, + "loss": 0.3324, + "num_input_tokens_seen": 11765872, + "step": 18040 + }, + { + "epoch": 10.639740566037736, + "grad_norm": 2.2902731895446777, + "learning_rate": 5.314692073781157e-06, + "loss": 0.2893, + "num_input_tokens_seen": 11768528, + "step": 18045 + }, + { + "epoch": 10.642688679245284, + "grad_norm": 1.5570361614227295, + "learning_rate": 5.312124418757207e-06, + "loss": 0.3557, + "num_input_tokens_seen": 11772208, + "step": 18050 + }, + { + "epoch": 10.64563679245283, + "grad_norm": 2.6260485649108887, + "learning_rate": 5.309556681096972e-06, + "loss": 0.5397, + "num_input_tokens_seen": 11776528, + "step": 18055 + }, + { + "epoch": 10.648584905660378, + "grad_norm": 2.0930910110473633, + "learning_rate": 5.306988861480271e-06, + "loss": 0.3975, + "num_input_tokens_seen": 11780176, + "step": 18060 + }, + { + "epoch": 10.651533018867925, + "grad_norm": 2.8467299938201904, + "learning_rate": 5.304420960586946e-06, + "loss": 0.3357, + "num_input_tokens_seen": 11783216, + "step": 18065 + }, + { + "epoch": 10.654481132075471, + "grad_norm": 1.5697345733642578, + "learning_rate": 5.3018529790968606e-06, + "loss": 0.3338, + "num_input_tokens_seen": 11789904, + "step": 18070 + }, + { + "epoch": 10.65742924528302, + "grad_norm": 2.2542638778686523, + "learning_rate": 5.299284917689898e-06, + "loss": 0.4727, + "num_input_tokens_seen": 11793040, + "step": 18075 + }, + { + "epoch": 10.660377358490566, + "grad_norm": 5.108577728271484, + "learning_rate": 5.296716777045962e-06, + "loss": 0.3884, + "num_input_tokens_seen": 11796336, + "step": 18080 + }, + { + "epoch": 10.663325471698114, + "grad_norm": 2.5337820053100586, + "learning_rate": 5.294148557844983e-06, + "loss": 0.3869, + "num_input_tokens_seen": 11799568, + "step": 18085 + }, + { + "epoch": 10.66627358490566, + "grad_norm": 2.8313708305358887, + "learning_rate": 5.291580260766904e-06, + "loss": 0.3317, + "num_input_tokens_seen": 11802960, + "step": 18090 + }, + { + "epoch": 10.669221698113208, + "grad_norm": 2.5068671703338623, + "learning_rate": 5.289011886491694e-06, + "loss": 0.3257, + "num_input_tokens_seen": 11807088, + "step": 18095 + }, + { + "epoch": 10.672169811320755, + "grad_norm": 2.98797607421875, + "learning_rate": 5.286443435699342e-06, + "loss": 0.3056, + "num_input_tokens_seen": 11811920, + "step": 18100 + }, + { + "epoch": 10.675117924528301, + "grad_norm": 1.6492878198623657, + "learning_rate": 5.283874909069855e-06, + "loss": 0.407, + "num_input_tokens_seen": 11814800, + "step": 18105 + }, + { + "epoch": 10.67806603773585, + "grad_norm": 2.4847452640533447, + "learning_rate": 5.281306307283263e-06, + "loss": 0.445, + "num_input_tokens_seen": 11818224, + "step": 18110 + }, + { + "epoch": 10.681014150943396, + "grad_norm": 3.850813865661621, + "learning_rate": 5.2787376310196145e-06, + "loss": 0.3669, + "num_input_tokens_seen": 11821424, + "step": 18115 + }, + { + "epoch": 10.683962264150944, + "grad_norm": 4.274418830871582, + "learning_rate": 5.276168880958977e-06, + "loss": 0.355, + "num_input_tokens_seen": 11823888, + "step": 18120 + }, + { + "epoch": 10.68691037735849, + "grad_norm": 6.18873929977417, + "learning_rate": 5.273600057781437e-06, + "loss": 0.3494, + "num_input_tokens_seen": 11826896, + "step": 18125 + }, + { + "epoch": 10.689858490566039, + "grad_norm": 4.320852756500244, + "learning_rate": 5.271031162167103e-06, + "loss": 0.362, + "num_input_tokens_seen": 11830224, + "step": 18130 + }, + { + "epoch": 10.692806603773585, + "grad_norm": 2.789550304412842, + "learning_rate": 5.268462194796101e-06, + "loss": 0.3199, + "num_input_tokens_seen": 11833456, + "step": 18135 + }, + { + "epoch": 10.695754716981131, + "grad_norm": 2.754539966583252, + "learning_rate": 5.265893156348576e-06, + "loss": 0.4827, + "num_input_tokens_seen": 11836848, + "step": 18140 + }, + { + "epoch": 10.69870283018868, + "grad_norm": 3.3421356678009033, + "learning_rate": 5.2633240475046925e-06, + "loss": 0.4115, + "num_input_tokens_seen": 11839504, + "step": 18145 + }, + { + "epoch": 10.701650943396226, + "grad_norm": 2.5830318927764893, + "learning_rate": 5.2607548689446305e-06, + "loss": 0.4453, + "num_input_tokens_seen": 11842640, + "step": 18150 + }, + { + "epoch": 10.704599056603774, + "grad_norm": 5.124602794647217, + "learning_rate": 5.258185621348595e-06, + "loss": 0.3442, + "num_input_tokens_seen": 11844592, + "step": 18155 + }, + { + "epoch": 10.70754716981132, + "grad_norm": 2.321617603302002, + "learning_rate": 5.255616305396801e-06, + "loss": 0.5397, + "num_input_tokens_seen": 11847696, + "step": 18160 + }, + { + "epoch": 10.710495283018869, + "grad_norm": 4.331507205963135, + "learning_rate": 5.253046921769491e-06, + "loss": 0.3407, + "num_input_tokens_seen": 11850672, + "step": 18165 + }, + { + "epoch": 10.713443396226415, + "grad_norm": 2.513171911239624, + "learning_rate": 5.250477471146916e-06, + "loss": 0.3258, + "num_input_tokens_seen": 11854480, + "step": 18170 + }, + { + "epoch": 10.716391509433961, + "grad_norm": 3.0567197799682617, + "learning_rate": 5.2479079542093535e-06, + "loss": 0.2342, + "num_input_tokens_seen": 11858032, + "step": 18175 + }, + { + "epoch": 10.71933962264151, + "grad_norm": 3.2821991443634033, + "learning_rate": 5.245338371637091e-06, + "loss": 0.3329, + "num_input_tokens_seen": 11861296, + "step": 18180 + }, + { + "epoch": 10.722287735849056, + "grad_norm": 3.178295612335205, + "learning_rate": 5.242768724110437e-06, + "loss": 0.4341, + "num_input_tokens_seen": 11863344, + "step": 18185 + }, + { + "epoch": 10.725235849056604, + "grad_norm": 3.619694232940674, + "learning_rate": 5.240199012309717e-06, + "loss": 0.3417, + "num_input_tokens_seen": 11866704, + "step": 18190 + }, + { + "epoch": 10.72818396226415, + "grad_norm": 2.2598514556884766, + "learning_rate": 5.237629236915273e-06, + "loss": 0.3639, + "num_input_tokens_seen": 11869648, + "step": 18195 + }, + { + "epoch": 10.731132075471699, + "grad_norm": 2.342189073562622, + "learning_rate": 5.235059398607464e-06, + "loss": 0.4036, + "num_input_tokens_seen": 11873040, + "step": 18200 + }, + { + "epoch": 10.734080188679245, + "grad_norm": 2.396108388900757, + "learning_rate": 5.232489498066665e-06, + "loss": 0.328, + "num_input_tokens_seen": 11876176, + "step": 18205 + }, + { + "epoch": 10.737028301886792, + "grad_norm": 3.746102809906006, + "learning_rate": 5.229919535973272e-06, + "loss": 0.4602, + "num_input_tokens_seen": 11878864, + "step": 18210 + }, + { + "epoch": 10.73997641509434, + "grad_norm": 4.024622917175293, + "learning_rate": 5.2273495130076905e-06, + "loss": 0.3387, + "num_input_tokens_seen": 11882192, + "step": 18215 + }, + { + "epoch": 10.742924528301886, + "grad_norm": 3.0105676651000977, + "learning_rate": 5.224779429850344e-06, + "loss": 0.3297, + "num_input_tokens_seen": 11885040, + "step": 18220 + }, + { + "epoch": 10.745872641509434, + "grad_norm": 3.0587942600250244, + "learning_rate": 5.222209287181677e-06, + "loss": 0.3419, + "num_input_tokens_seen": 11888112, + "step": 18225 + }, + { + "epoch": 10.74882075471698, + "grad_norm": 1.7900869846343994, + "learning_rate": 5.219639085682142e-06, + "loss": 0.2961, + "num_input_tokens_seen": 11891440, + "step": 18230 + }, + { + "epoch": 10.751768867924529, + "grad_norm": 1.9678398370742798, + "learning_rate": 5.2170688260322124e-06, + "loss": 0.3792, + "num_input_tokens_seen": 11894224, + "step": 18235 + }, + { + "epoch": 10.754716981132075, + "grad_norm": 3.475299119949341, + "learning_rate": 5.214498508912376e-06, + "loss": 0.4167, + "num_input_tokens_seen": 11898384, + "step": 18240 + }, + { + "epoch": 10.757665094339622, + "grad_norm": 5.283947467803955, + "learning_rate": 5.211928135003135e-06, + "loss": 0.293, + "num_input_tokens_seen": 11901296, + "step": 18245 + }, + { + "epoch": 10.76061320754717, + "grad_norm": 2.512531042098999, + "learning_rate": 5.209357704985007e-06, + "loss": 0.4455, + "num_input_tokens_seen": 11904336, + "step": 18250 + }, + { + "epoch": 10.763561320754716, + "grad_norm": 3.9295547008514404, + "learning_rate": 5.206787219538524e-06, + "loss": 0.3935, + "num_input_tokens_seen": 11907760, + "step": 18255 + }, + { + "epoch": 10.766509433962264, + "grad_norm": 3.534097671508789, + "learning_rate": 5.204216679344234e-06, + "loss": 0.385, + "num_input_tokens_seen": 11910544, + "step": 18260 + }, + { + "epoch": 10.76945754716981, + "grad_norm": 3.6369435787200928, + "learning_rate": 5.201646085082696e-06, + "loss": 0.3224, + "num_input_tokens_seen": 11913200, + "step": 18265 + }, + { + "epoch": 10.772405660377359, + "grad_norm": 1.9674863815307617, + "learning_rate": 5.199075437434491e-06, + "loss": 0.3407, + "num_input_tokens_seen": 11915856, + "step": 18270 + }, + { + "epoch": 10.775353773584905, + "grad_norm": 3.5680670738220215, + "learning_rate": 5.1965047370802046e-06, + "loss": 0.4124, + "num_input_tokens_seen": 11919280, + "step": 18275 + }, + { + "epoch": 10.778301886792454, + "grad_norm": 3.434519052505493, + "learning_rate": 5.193933984700441e-06, + "loss": 0.4124, + "num_input_tokens_seen": 11922192, + "step": 18280 + }, + { + "epoch": 10.78125, + "grad_norm": 4.892963886260986, + "learning_rate": 5.1913631809758216e-06, + "loss": 0.3011, + "num_input_tokens_seen": 11925584, + "step": 18285 + }, + { + "epoch": 10.784198113207546, + "grad_norm": 2.604424476623535, + "learning_rate": 5.188792326586973e-06, + "loss": 0.4597, + "num_input_tokens_seen": 11928720, + "step": 18290 + }, + { + "epoch": 10.787146226415095, + "grad_norm": 3.5810248851776123, + "learning_rate": 5.186221422214544e-06, + "loss": 0.3354, + "num_input_tokens_seen": 11933232, + "step": 18295 + }, + { + "epoch": 10.790094339622641, + "grad_norm": 3.4061319828033447, + "learning_rate": 5.1836504685391885e-06, + "loss": 0.3631, + "num_input_tokens_seen": 11936208, + "step": 18300 + }, + { + "epoch": 10.79304245283019, + "grad_norm": 3.354257106781006, + "learning_rate": 5.181079466241582e-06, + "loss": 0.3286, + "num_input_tokens_seen": 11939504, + "step": 18305 + }, + { + "epoch": 10.795990566037736, + "grad_norm": 2.7241334915161133, + "learning_rate": 5.178508416002406e-06, + "loss": 0.4243, + "num_input_tokens_seen": 11941872, + "step": 18310 + }, + { + "epoch": 10.798938679245284, + "grad_norm": 0.5349394083023071, + "learning_rate": 5.175937318502357e-06, + "loss": 0.2514, + "num_input_tokens_seen": 11949072, + "step": 18315 + }, + { + "epoch": 10.80188679245283, + "grad_norm": 1.9773993492126465, + "learning_rate": 5.173366174422147e-06, + "loss": 0.3245, + "num_input_tokens_seen": 11952144, + "step": 18320 + }, + { + "epoch": 10.804834905660378, + "grad_norm": 2.267798900604248, + "learning_rate": 5.170794984442492e-06, + "loss": 0.6438, + "num_input_tokens_seen": 11954640, + "step": 18325 + }, + { + "epoch": 10.807783018867925, + "grad_norm": 1.7042869329452515, + "learning_rate": 5.16822374924413e-06, + "loss": 0.2986, + "num_input_tokens_seen": 11957968, + "step": 18330 + }, + { + "epoch": 10.810731132075471, + "grad_norm": 3.274555206298828, + "learning_rate": 5.165652469507806e-06, + "loss": 0.4318, + "num_input_tokens_seen": 11960944, + "step": 18335 + }, + { + "epoch": 10.81367924528302, + "grad_norm": 2.5985512733459473, + "learning_rate": 5.163081145914276e-06, + "loss": 0.3977, + "num_input_tokens_seen": 11964496, + "step": 18340 + }, + { + "epoch": 10.816627358490566, + "grad_norm": 2.8874995708465576, + "learning_rate": 5.160509779144311e-06, + "loss": 0.3632, + "num_input_tokens_seen": 11967152, + "step": 18345 + }, + { + "epoch": 10.819575471698114, + "grad_norm": 4.250257968902588, + "learning_rate": 5.157938369878688e-06, + "loss": 0.394, + "num_input_tokens_seen": 11969744, + "step": 18350 + }, + { + "epoch": 10.82252358490566, + "grad_norm": 2.335707426071167, + "learning_rate": 5.155366918798203e-06, + "loss": 0.2809, + "num_input_tokens_seen": 11972400, + "step": 18355 + }, + { + "epoch": 10.825471698113208, + "grad_norm": 1.4992411136627197, + "learning_rate": 5.152795426583654e-06, + "loss": 0.3309, + "num_input_tokens_seen": 11974992, + "step": 18360 + }, + { + "epoch": 10.828419811320755, + "grad_norm": 2.5281145572662354, + "learning_rate": 5.15022389391586e-06, + "loss": 0.3745, + "num_input_tokens_seen": 11978096, + "step": 18365 + }, + { + "epoch": 10.831367924528301, + "grad_norm": 3.674957275390625, + "learning_rate": 5.147652321475642e-06, + "loss": 0.5072, + "num_input_tokens_seen": 11981008, + "step": 18370 + }, + { + "epoch": 10.83431603773585, + "grad_norm": 1.9721482992172241, + "learning_rate": 5.145080709943835e-06, + "loss": 0.3936, + "num_input_tokens_seen": 11984336, + "step": 18375 + }, + { + "epoch": 10.837264150943396, + "grad_norm": 4.812885284423828, + "learning_rate": 5.142509060001285e-06, + "loss": 0.3093, + "num_input_tokens_seen": 11986736, + "step": 18380 + }, + { + "epoch": 10.840212264150944, + "grad_norm": 3.333235263824463, + "learning_rate": 5.139937372328847e-06, + "loss": 0.4739, + "num_input_tokens_seen": 11990608, + "step": 18385 + }, + { + "epoch": 10.84316037735849, + "grad_norm": 5.352551460266113, + "learning_rate": 5.1373656476073876e-06, + "loss": 0.3725, + "num_input_tokens_seen": 11993552, + "step": 18390 + }, + { + "epoch": 10.846108490566039, + "grad_norm": 4.58211088180542, + "learning_rate": 5.134793886517779e-06, + "loss": 0.3775, + "num_input_tokens_seen": 11995952, + "step": 18395 + }, + { + "epoch": 10.849056603773585, + "grad_norm": 2.18815016746521, + "learning_rate": 5.1322220897409105e-06, + "loss": 0.3981, + "num_input_tokens_seen": 11999376, + "step": 18400 + }, + { + "epoch": 10.852004716981131, + "grad_norm": 2.474853038787842, + "learning_rate": 5.129650257957671e-06, + "loss": 0.4274, + "num_input_tokens_seen": 12003312, + "step": 18405 + }, + { + "epoch": 10.85495283018868, + "grad_norm": 4.461069583892822, + "learning_rate": 5.12707839184897e-06, + "loss": 0.4039, + "num_input_tokens_seen": 12006416, + "step": 18410 + }, + { + "epoch": 10.857900943396226, + "grad_norm": 2.0852251052856445, + "learning_rate": 5.124506492095716e-06, + "loss": 0.3129, + "num_input_tokens_seen": 12010160, + "step": 18415 + }, + { + "epoch": 10.860849056603774, + "grad_norm": 1.6385436058044434, + "learning_rate": 5.121934559378831e-06, + "loss": 0.3494, + "num_input_tokens_seen": 12014256, + "step": 18420 + }, + { + "epoch": 10.86379716981132, + "grad_norm": 2.8635153770446777, + "learning_rate": 5.1193625943792456e-06, + "loss": 0.426, + "num_input_tokens_seen": 12017456, + "step": 18425 + }, + { + "epoch": 10.866745283018869, + "grad_norm": 2.319417953491211, + "learning_rate": 5.116790597777901e-06, + "loss": 0.3499, + "num_input_tokens_seen": 12020720, + "step": 18430 + }, + { + "epoch": 10.869693396226415, + "grad_norm": 1.647322177886963, + "learning_rate": 5.11421857025574e-06, + "loss": 0.2392, + "num_input_tokens_seen": 12024080, + "step": 18435 + }, + { + "epoch": 10.872641509433961, + "grad_norm": 5.10393762588501, + "learning_rate": 5.111646512493721e-06, + "loss": 0.3892, + "num_input_tokens_seen": 12026736, + "step": 18440 + }, + { + "epoch": 10.87558962264151, + "grad_norm": 1.9618160724639893, + "learning_rate": 5.109074425172806e-06, + "loss": 0.3227, + "num_input_tokens_seen": 12029296, + "step": 18445 + }, + { + "epoch": 10.878537735849056, + "grad_norm": 1.7240840196609497, + "learning_rate": 5.106502308973967e-06, + "loss": 0.3229, + "num_input_tokens_seen": 12032880, + "step": 18450 + }, + { + "epoch": 10.881485849056604, + "grad_norm": 2.490644693374634, + "learning_rate": 5.103930164578184e-06, + "loss": 0.4329, + "num_input_tokens_seen": 12036912, + "step": 18455 + }, + { + "epoch": 10.88443396226415, + "grad_norm": 2.762875556945801, + "learning_rate": 5.101357992666441e-06, + "loss": 0.3234, + "num_input_tokens_seen": 12039632, + "step": 18460 + }, + { + "epoch": 10.887382075471699, + "grad_norm": 2.833361864089966, + "learning_rate": 5.098785793919733e-06, + "loss": 0.422, + "num_input_tokens_seen": 12043632, + "step": 18465 + }, + { + "epoch": 10.890330188679245, + "grad_norm": 2.667381763458252, + "learning_rate": 5.096213569019061e-06, + "loss": 0.4758, + "num_input_tokens_seen": 12047824, + "step": 18470 + }, + { + "epoch": 10.893278301886792, + "grad_norm": 2.444791793823242, + "learning_rate": 5.0936413186454315e-06, + "loss": 0.2931, + "num_input_tokens_seen": 12050480, + "step": 18475 + }, + { + "epoch": 10.89622641509434, + "grad_norm": 2.135464906692505, + "learning_rate": 5.0910690434798584e-06, + "loss": 0.4547, + "num_input_tokens_seen": 12053680, + "step": 18480 + }, + { + "epoch": 10.899174528301886, + "grad_norm": 3.5650410652160645, + "learning_rate": 5.088496744203364e-06, + "loss": 0.3956, + "num_input_tokens_seen": 12056688, + "step": 18485 + }, + { + "epoch": 10.902122641509434, + "grad_norm": 2.479853391647339, + "learning_rate": 5.085924421496976e-06, + "loss": 0.3798, + "num_input_tokens_seen": 12060720, + "step": 18490 + }, + { + "epoch": 10.90507075471698, + "grad_norm": 3.08607816696167, + "learning_rate": 5.083352076041725e-06, + "loss": 0.2465, + "num_input_tokens_seen": 12063216, + "step": 18495 + }, + { + "epoch": 10.908018867924529, + "grad_norm": 3.8438076972961426, + "learning_rate": 5.080779708518654e-06, + "loss": 0.3978, + "num_input_tokens_seen": 12066800, + "step": 18500 + }, + { + "epoch": 10.910966981132075, + "grad_norm": 2.1845197677612305, + "learning_rate": 5.078207319608807e-06, + "loss": 0.3492, + "num_input_tokens_seen": 12070320, + "step": 18505 + }, + { + "epoch": 10.913915094339622, + "grad_norm": 2.693627119064331, + "learning_rate": 5.075634909993235e-06, + "loss": 0.3543, + "num_input_tokens_seen": 12073296, + "step": 18510 + }, + { + "epoch": 10.91686320754717, + "grad_norm": 3.956373453140259, + "learning_rate": 5.073062480352995e-06, + "loss": 0.336, + "num_input_tokens_seen": 12075472, + "step": 18515 + }, + { + "epoch": 10.919811320754716, + "grad_norm": 8.80846118927002, + "learning_rate": 5.070490031369149e-06, + "loss": 0.395, + "num_input_tokens_seen": 12078384, + "step": 18520 + }, + { + "epoch": 10.922759433962264, + "grad_norm": 2.1584599018096924, + "learning_rate": 5.067917563722762e-06, + "loss": 0.3274, + "num_input_tokens_seen": 12081808, + "step": 18525 + }, + { + "epoch": 10.92570754716981, + "grad_norm": 2.879453659057617, + "learning_rate": 5.065345078094907e-06, + "loss": 0.4302, + "num_input_tokens_seen": 12085200, + "step": 18530 + }, + { + "epoch": 10.928655660377359, + "grad_norm": 1.6405314207077026, + "learning_rate": 5.062772575166663e-06, + "loss": 0.3497, + "num_input_tokens_seen": 12087888, + "step": 18535 + }, + { + "epoch": 10.931603773584905, + "grad_norm": 3.7632956504821777, + "learning_rate": 5.0602000556191075e-06, + "loss": 0.2891, + "num_input_tokens_seen": 12090608, + "step": 18540 + }, + { + "epoch": 10.934551886792454, + "grad_norm": 2.590761661529541, + "learning_rate": 5.0576275201333284e-06, + "loss": 0.2786, + "num_input_tokens_seen": 12094960, + "step": 18545 + }, + { + "epoch": 10.9375, + "grad_norm": 3.6195850372314453, + "learning_rate": 5.055054969390415e-06, + "loss": 0.3768, + "num_input_tokens_seen": 12098352, + "step": 18550 + }, + { + "epoch": 10.940448113207546, + "grad_norm": 3.2892050743103027, + "learning_rate": 5.052482404071461e-06, + "loss": 0.4303, + "num_input_tokens_seen": 12101840, + "step": 18555 + }, + { + "epoch": 10.943396226415095, + "grad_norm": 3.4126620292663574, + "learning_rate": 5.049909824857564e-06, + "loss": 0.4604, + "num_input_tokens_seen": 12106256, + "step": 18560 + }, + { + "epoch": 10.946344339622641, + "grad_norm": 2.6609344482421875, + "learning_rate": 5.047337232429827e-06, + "loss": 0.4776, + "num_input_tokens_seen": 12108528, + "step": 18565 + }, + { + "epoch": 10.94929245283019, + "grad_norm": 2.55163836479187, + "learning_rate": 5.044764627469354e-06, + "loss": 0.3411, + "num_input_tokens_seen": 12111920, + "step": 18570 + }, + { + "epoch": 10.952240566037736, + "grad_norm": 2.01534366607666, + "learning_rate": 5.042192010657251e-06, + "loss": 0.3128, + "num_input_tokens_seen": 12115024, + "step": 18575 + }, + { + "epoch": 10.955188679245284, + "grad_norm": 2.3214597702026367, + "learning_rate": 5.039619382674632e-06, + "loss": 0.3344, + "num_input_tokens_seen": 12118096, + "step": 18580 + }, + { + "epoch": 10.95813679245283, + "grad_norm": 3.8189620971679688, + "learning_rate": 5.0370467442026115e-06, + "loss": 0.3442, + "num_input_tokens_seen": 12122224, + "step": 18585 + }, + { + "epoch": 10.961084905660378, + "grad_norm": 2.1392595767974854, + "learning_rate": 5.034474095922304e-06, + "loss": 0.3679, + "num_input_tokens_seen": 12125552, + "step": 18590 + }, + { + "epoch": 10.964033018867925, + "grad_norm": 3.241791009902954, + "learning_rate": 5.031901438514832e-06, + "loss": 0.2092, + "num_input_tokens_seen": 12129232, + "step": 18595 + }, + { + "epoch": 10.966981132075471, + "grad_norm": 2.5336267948150635, + "learning_rate": 5.0293287726613185e-06, + "loss": 0.4918, + "num_input_tokens_seen": 12132176, + "step": 18600 + }, + { + "epoch": 10.96992924528302, + "grad_norm": 2.9993631839752197, + "learning_rate": 5.0267560990428836e-06, + "loss": 0.4941, + "num_input_tokens_seen": 12135600, + "step": 18605 + }, + { + "epoch": 10.972877358490566, + "grad_norm": 4.378379821777344, + "learning_rate": 5.024183418340657e-06, + "loss": 0.3432, + "num_input_tokens_seen": 12139088, + "step": 18610 + }, + { + "epoch": 10.975825471698114, + "grad_norm": 2.6746349334716797, + "learning_rate": 5.021610731235766e-06, + "loss": 0.2516, + "num_input_tokens_seen": 12143024, + "step": 18615 + }, + { + "epoch": 10.97877358490566, + "grad_norm": 3.42079758644104, + "learning_rate": 5.01903803840934e-06, + "loss": 0.3362, + "num_input_tokens_seen": 12146640, + "step": 18620 + }, + { + "epoch": 10.981721698113208, + "grad_norm": 2.9208247661590576, + "learning_rate": 5.016465340542514e-06, + "loss": 0.3411, + "num_input_tokens_seen": 12150704, + "step": 18625 + }, + { + "epoch": 10.984669811320755, + "grad_norm": 3.3420186042785645, + "learning_rate": 5.013892638316417e-06, + "loss": 0.3694, + "num_input_tokens_seen": 12155920, + "step": 18630 + }, + { + "epoch": 10.987617924528301, + "grad_norm": 2.7794246673583984, + "learning_rate": 5.011319932412182e-06, + "loss": 0.377, + "num_input_tokens_seen": 12158352, + "step": 18635 + }, + { + "epoch": 10.99056603773585, + "grad_norm": 1.4129530191421509, + "learning_rate": 5.008747223510947e-06, + "loss": 0.2736, + "num_input_tokens_seen": 12162512, + "step": 18640 + }, + { + "epoch": 10.993514150943396, + "grad_norm": 7.344161510467529, + "learning_rate": 5.006174512293849e-06, + "loss": 0.3488, + "num_input_tokens_seen": 12165296, + "step": 18645 + }, + { + "epoch": 10.996462264150944, + "grad_norm": 5.310723781585693, + "learning_rate": 5.003601799442019e-06, + "loss": 0.3677, + "num_input_tokens_seen": 12168720, + "step": 18650 + }, + { + "epoch": 10.99941037735849, + "grad_norm": 2.111420154571533, + "learning_rate": 5.0010290856366e-06, + "loss": 0.4537, + "num_input_tokens_seen": 12171856, + "step": 18655 + }, + { + "epoch": 11.002358490566039, + "grad_norm": 2.004443645477295, + "learning_rate": 4.998456371558726e-06, + "loss": 0.2664, + "num_input_tokens_seen": 12174800, + "step": 18660 + }, + { + "epoch": 11.005306603773585, + "grad_norm": 2.8950324058532715, + "learning_rate": 4.995883657889531e-06, + "loss": 0.4325, + "num_input_tokens_seen": 12178544, + "step": 18665 + }, + { + "epoch": 11.008254716981131, + "grad_norm": 2.3139302730560303, + "learning_rate": 4.993310945310158e-06, + "loss": 0.3309, + "num_input_tokens_seen": 12182192, + "step": 18670 + }, + { + "epoch": 11.01120283018868, + "grad_norm": 1.9467076063156128, + "learning_rate": 4.9907382345017416e-06, + "loss": 0.3315, + "num_input_tokens_seen": 12185552, + "step": 18675 + }, + { + "epoch": 11.014150943396226, + "grad_norm": 2.683283805847168, + "learning_rate": 4.988165526145416e-06, + "loss": 0.4825, + "num_input_tokens_seen": 12188176, + "step": 18680 + }, + { + "epoch": 11.017099056603774, + "grad_norm": 2.354342460632324, + "learning_rate": 4.985592820922319e-06, + "loss": 0.29, + "num_input_tokens_seen": 12191152, + "step": 18685 + }, + { + "epoch": 11.02004716981132, + "grad_norm": 5.716548442840576, + "learning_rate": 4.983020119513586e-06, + "loss": 0.3524, + "num_input_tokens_seen": 12194448, + "step": 18690 + }, + { + "epoch": 11.022995283018869, + "grad_norm": 2.2911159992218018, + "learning_rate": 4.9804474226003465e-06, + "loss": 0.3198, + "num_input_tokens_seen": 12197776, + "step": 18695 + }, + { + "epoch": 11.025943396226415, + "grad_norm": 3.390439748764038, + "learning_rate": 4.977874730863739e-06, + "loss": 0.342, + "num_input_tokens_seen": 12201424, + "step": 18700 + }, + { + "epoch": 11.028891509433961, + "grad_norm": 2.525070905685425, + "learning_rate": 4.975302044984889e-06, + "loss": 0.3029, + "num_input_tokens_seen": 12204016, + "step": 18705 + }, + { + "epoch": 11.03183962264151, + "grad_norm": 2.57283091545105, + "learning_rate": 4.972729365644931e-06, + "loss": 0.3925, + "num_input_tokens_seen": 12207120, + "step": 18710 + }, + { + "epoch": 11.034787735849056, + "grad_norm": 2.4361491203308105, + "learning_rate": 4.97015669352499e-06, + "loss": 0.5008, + "num_input_tokens_seen": 12210992, + "step": 18715 + }, + { + "epoch": 11.037735849056604, + "grad_norm": 2.788032293319702, + "learning_rate": 4.967584029306194e-06, + "loss": 0.3703, + "num_input_tokens_seen": 12215184, + "step": 18720 + }, + { + "epoch": 11.04068396226415, + "grad_norm": 2.1505331993103027, + "learning_rate": 4.965011373669666e-06, + "loss": 0.3677, + "num_input_tokens_seen": 12218928, + "step": 18725 + }, + { + "epoch": 11.043632075471699, + "grad_norm": 2.590747356414795, + "learning_rate": 4.962438727296527e-06, + "loss": 0.3451, + "num_input_tokens_seen": 12222032, + "step": 18730 + }, + { + "epoch": 11.046580188679245, + "grad_norm": 3.482480764389038, + "learning_rate": 4.959866090867897e-06, + "loss": 0.4006, + "num_input_tokens_seen": 12225104, + "step": 18735 + }, + { + "epoch": 11.049528301886792, + "grad_norm": 2.307365894317627, + "learning_rate": 4.957293465064893e-06, + "loss": 0.3898, + "num_input_tokens_seen": 12227824, + "step": 18740 + }, + { + "epoch": 11.05247641509434, + "grad_norm": 5.037140369415283, + "learning_rate": 4.954720850568627e-06, + "loss": 0.3607, + "num_input_tokens_seen": 12230448, + "step": 18745 + }, + { + "epoch": 11.055424528301886, + "grad_norm": 4.02696418762207, + "learning_rate": 4.952148248060212e-06, + "loss": 0.3053, + "num_input_tokens_seen": 12233776, + "step": 18750 + }, + { + "epoch": 11.058372641509434, + "grad_norm": 3.7996461391448975, + "learning_rate": 4.949575658220755e-06, + "loss": 0.2564, + "num_input_tokens_seen": 12236240, + "step": 18755 + }, + { + "epoch": 11.06132075471698, + "grad_norm": 4.318322658538818, + "learning_rate": 4.947003081731359e-06, + "loss": 0.4753, + "num_input_tokens_seen": 12239504, + "step": 18760 + }, + { + "epoch": 11.064268867924529, + "grad_norm": 2.4440433979034424, + "learning_rate": 4.944430519273126e-06, + "loss": 0.3391, + "num_input_tokens_seen": 12242416, + "step": 18765 + }, + { + "epoch": 11.067216981132075, + "grad_norm": 3.1278347969055176, + "learning_rate": 4.941857971527152e-06, + "loss": 0.4231, + "num_input_tokens_seen": 12246192, + "step": 18770 + }, + { + "epoch": 11.070165094339623, + "grad_norm": 2.9099042415618896, + "learning_rate": 4.93928543917453e-06, + "loss": 0.3938, + "num_input_tokens_seen": 12250096, + "step": 18775 + }, + { + "epoch": 11.07311320754717, + "grad_norm": 3.319836378097534, + "learning_rate": 4.93671292289635e-06, + "loss": 0.4379, + "num_input_tokens_seen": 12254032, + "step": 18780 + }, + { + "epoch": 11.076061320754716, + "grad_norm": 2.8216004371643066, + "learning_rate": 4.934140423373698e-06, + "loss": 0.3173, + "num_input_tokens_seen": 12257424, + "step": 18785 + }, + { + "epoch": 11.079009433962264, + "grad_norm": 2.5357778072357178, + "learning_rate": 4.931567941287651e-06, + "loss": 0.263, + "num_input_tokens_seen": 12260944, + "step": 18790 + }, + { + "epoch": 11.08195754716981, + "grad_norm": 4.195082664489746, + "learning_rate": 4.9289954773192875e-06, + "loss": 0.3273, + "num_input_tokens_seen": 12263056, + "step": 18795 + }, + { + "epoch": 11.084905660377359, + "grad_norm": 4.919189453125, + "learning_rate": 4.926423032149677e-06, + "loss": 0.2669, + "num_input_tokens_seen": 12266480, + "step": 18800 + }, + { + "epoch": 11.087853773584905, + "grad_norm": 2.735436201095581, + "learning_rate": 4.923850606459883e-06, + "loss": 0.3957, + "num_input_tokens_seen": 12269328, + "step": 18805 + }, + { + "epoch": 11.090801886792454, + "grad_norm": 2.6453256607055664, + "learning_rate": 4.921278200930972e-06, + "loss": 0.2954, + "num_input_tokens_seen": 12271408, + "step": 18810 + }, + { + "epoch": 11.09375, + "grad_norm": 3.1545779705047607, + "learning_rate": 4.918705816243996e-06, + "loss": 0.4298, + "num_input_tokens_seen": 12274384, + "step": 18815 + }, + { + "epoch": 11.096698113207546, + "grad_norm": 5.975141525268555, + "learning_rate": 4.916133453080007e-06, + "loss": 0.3106, + "num_input_tokens_seen": 12277648, + "step": 18820 + }, + { + "epoch": 11.099646226415095, + "grad_norm": 2.9268081188201904, + "learning_rate": 4.913561112120046e-06, + "loss": 0.4145, + "num_input_tokens_seen": 12282096, + "step": 18825 + }, + { + "epoch": 11.102594339622641, + "grad_norm": 3.140249729156494, + "learning_rate": 4.910988794045154e-06, + "loss": 0.4101, + "num_input_tokens_seen": 12284912, + "step": 18830 + }, + { + "epoch": 11.10554245283019, + "grad_norm": 1.6862032413482666, + "learning_rate": 4.9084164995363626e-06, + "loss": 0.3242, + "num_input_tokens_seen": 12287824, + "step": 18835 + }, + { + "epoch": 11.108490566037736, + "grad_norm": 3.221494197845459, + "learning_rate": 4.905844229274697e-06, + "loss": 0.4541, + "num_input_tokens_seen": 12291152, + "step": 18840 + }, + { + "epoch": 11.111438679245284, + "grad_norm": 2.7325901985168457, + "learning_rate": 4.903271983941177e-06, + "loss": 0.3456, + "num_input_tokens_seen": 12293904, + "step": 18845 + }, + { + "epoch": 11.11438679245283, + "grad_norm": 3.1134402751922607, + "learning_rate": 4.900699764216818e-06, + "loss": 0.3313, + "num_input_tokens_seen": 12296432, + "step": 18850 + }, + { + "epoch": 11.117334905660377, + "grad_norm": 2.9292571544647217, + "learning_rate": 4.898127570782622e-06, + "loss": 0.3298, + "num_input_tokens_seen": 12299376, + "step": 18855 + }, + { + "epoch": 11.120283018867925, + "grad_norm": 2.5798046588897705, + "learning_rate": 4.895555404319592e-06, + "loss": 0.3568, + "num_input_tokens_seen": 12301712, + "step": 18860 + }, + { + "epoch": 11.123231132075471, + "grad_norm": 7.139198303222656, + "learning_rate": 4.89298326550872e-06, + "loss": 0.4002, + "num_input_tokens_seen": 12304496, + "step": 18865 + }, + { + "epoch": 11.12617924528302, + "grad_norm": 5.531651496887207, + "learning_rate": 4.8904111550309876e-06, + "loss": 0.4819, + "num_input_tokens_seen": 12307344, + "step": 18870 + }, + { + "epoch": 11.129127358490566, + "grad_norm": 3.0349009037017822, + "learning_rate": 4.8878390735673755e-06, + "loss": 0.3284, + "num_input_tokens_seen": 12310576, + "step": 18875 + }, + { + "epoch": 11.132075471698114, + "grad_norm": 6.8454742431640625, + "learning_rate": 4.8852670217988505e-06, + "loss": 0.324, + "num_input_tokens_seen": 12315120, + "step": 18880 + }, + { + "epoch": 11.13502358490566, + "grad_norm": 1.9079841375350952, + "learning_rate": 4.882695000406377e-06, + "loss": 0.3037, + "num_input_tokens_seen": 12318320, + "step": 18885 + }, + { + "epoch": 11.137971698113208, + "grad_norm": 2.1093997955322266, + "learning_rate": 4.880123010070909e-06, + "loss": 0.344, + "num_input_tokens_seen": 12321328, + "step": 18890 + }, + { + "epoch": 11.140919811320755, + "grad_norm": 5.495284557342529, + "learning_rate": 4.877551051473388e-06, + "loss": 0.2915, + "num_input_tokens_seen": 12324432, + "step": 18895 + }, + { + "epoch": 11.143867924528301, + "grad_norm": 2.76992130279541, + "learning_rate": 4.874979125294755e-06, + "loss": 0.4132, + "num_input_tokens_seen": 12327632, + "step": 18900 + }, + { + "epoch": 11.14681603773585, + "grad_norm": 3.3313262462615967, + "learning_rate": 4.872407232215937e-06, + "loss": 0.4083, + "num_input_tokens_seen": 12330544, + "step": 18905 + }, + { + "epoch": 11.149764150943396, + "grad_norm": 2.287444591522217, + "learning_rate": 4.8698353729178546e-06, + "loss": 0.3859, + "num_input_tokens_seen": 12333968, + "step": 18910 + }, + { + "epoch": 11.152712264150944, + "grad_norm": 3.676532030105591, + "learning_rate": 4.867263548081418e-06, + "loss": 0.3358, + "num_input_tokens_seen": 12337104, + "step": 18915 + }, + { + "epoch": 11.15566037735849, + "grad_norm": 2.8962314128875732, + "learning_rate": 4.8646917583875304e-06, + "loss": 0.236, + "num_input_tokens_seen": 12340656, + "step": 18920 + }, + { + "epoch": 11.158608490566039, + "grad_norm": 2.3854353427886963, + "learning_rate": 4.862120004517082e-06, + "loss": 0.4384, + "num_input_tokens_seen": 12344304, + "step": 18925 + }, + { + "epoch": 11.161556603773585, + "grad_norm": 2.607245683670044, + "learning_rate": 4.859548287150956e-06, + "loss": 0.294, + "num_input_tokens_seen": 12350800, + "step": 18930 + }, + { + "epoch": 11.164504716981131, + "grad_norm": 5.347283840179443, + "learning_rate": 4.8569766069700275e-06, + "loss": 0.3419, + "num_input_tokens_seen": 12354096, + "step": 18935 + }, + { + "epoch": 11.16745283018868, + "grad_norm": 1.8244376182556152, + "learning_rate": 4.854404964655158e-06, + "loss": 0.3323, + "num_input_tokens_seen": 12357520, + "step": 18940 + }, + { + "epoch": 11.170400943396226, + "grad_norm": 2.336599826812744, + "learning_rate": 4.8518333608872015e-06, + "loss": 0.2989, + "num_input_tokens_seen": 12360592, + "step": 18945 + }, + { + "epoch": 11.173349056603774, + "grad_norm": 2.4436724185943604, + "learning_rate": 4.849261796347002e-06, + "loss": 0.4278, + "num_input_tokens_seen": 12363120, + "step": 18950 + }, + { + "epoch": 11.17629716981132, + "grad_norm": 2.5173592567443848, + "learning_rate": 4.846690271715391e-06, + "loss": 0.3995, + "num_input_tokens_seen": 12366896, + "step": 18955 + }, + { + "epoch": 11.179245283018869, + "grad_norm": 1.7357096672058105, + "learning_rate": 4.844118787673191e-06, + "loss": 0.4895, + "num_input_tokens_seen": 12370096, + "step": 18960 + }, + { + "epoch": 11.182193396226415, + "grad_norm": 1.5885047912597656, + "learning_rate": 4.841547344901214e-06, + "loss": 0.2992, + "num_input_tokens_seen": 12373808, + "step": 18965 + }, + { + "epoch": 11.185141509433961, + "grad_norm": 1.6408543586730957, + "learning_rate": 4.838975944080261e-06, + "loss": 0.3822, + "num_input_tokens_seen": 12378064, + "step": 18970 + }, + { + "epoch": 11.18808962264151, + "grad_norm": 2.300551414489746, + "learning_rate": 4.83640458589112e-06, + "loss": 0.3382, + "num_input_tokens_seen": 12381744, + "step": 18975 + }, + { + "epoch": 11.191037735849056, + "grad_norm": 2.3883817195892334, + "learning_rate": 4.833833271014571e-06, + "loss": 0.3703, + "num_input_tokens_seen": 12384656, + "step": 18980 + }, + { + "epoch": 11.193985849056604, + "grad_norm": 2.3296713829040527, + "learning_rate": 4.831262000131379e-06, + "loss": 0.3804, + "num_input_tokens_seen": 12388336, + "step": 18985 + }, + { + "epoch": 11.19693396226415, + "grad_norm": 2.4810731410980225, + "learning_rate": 4.828690773922299e-06, + "loss": 0.2396, + "num_input_tokens_seen": 12390704, + "step": 18990 + }, + { + "epoch": 11.199882075471699, + "grad_norm": 2.139838457107544, + "learning_rate": 4.826119593068074e-06, + "loss": 0.238, + "num_input_tokens_seen": 12395024, + "step": 18995 + }, + { + "epoch": 11.202830188679245, + "grad_norm": 5.266247749328613, + "learning_rate": 4.8235484582494375e-06, + "loss": 0.3546, + "num_input_tokens_seen": 12398192, + "step": 19000 + }, + { + "epoch": 11.205778301886792, + "grad_norm": 3.186474561691284, + "learning_rate": 4.8209773701471076e-06, + "loss": 0.6617, + "num_input_tokens_seen": 12401392, + "step": 19005 + }, + { + "epoch": 11.20872641509434, + "grad_norm": 3.941420555114746, + "learning_rate": 4.818406329441789e-06, + "loss": 0.5045, + "num_input_tokens_seen": 12404912, + "step": 19010 + }, + { + "epoch": 11.211674528301886, + "grad_norm": 3.098647356033325, + "learning_rate": 4.815835336814179e-06, + "loss": 0.4558, + "num_input_tokens_seen": 12411408, + "step": 19015 + }, + { + "epoch": 11.214622641509434, + "grad_norm": 4.1012349128723145, + "learning_rate": 4.813264392944957e-06, + "loss": 0.3711, + "num_input_tokens_seen": 12414480, + "step": 19020 + }, + { + "epoch": 11.21757075471698, + "grad_norm": 2.6398797035217285, + "learning_rate": 4.8106934985147905e-06, + "loss": 0.285, + "num_input_tokens_seen": 12418160, + "step": 19025 + }, + { + "epoch": 11.220518867924529, + "grad_norm": 2.8535752296447754, + "learning_rate": 4.808122654204338e-06, + "loss": 0.3588, + "num_input_tokens_seen": 12421360, + "step": 19030 + }, + { + "epoch": 11.223466981132075, + "grad_norm": 6.178938865661621, + "learning_rate": 4.805551860694239e-06, + "loss": 0.4009, + "num_input_tokens_seen": 12423888, + "step": 19035 + }, + { + "epoch": 11.226415094339623, + "grad_norm": 1.7846553325653076, + "learning_rate": 4.802981118665122e-06, + "loss": 0.3855, + "num_input_tokens_seen": 12428208, + "step": 19040 + }, + { + "epoch": 11.22936320754717, + "grad_norm": 3.182849884033203, + "learning_rate": 4.800410428797604e-06, + "loss": 0.4328, + "num_input_tokens_seen": 12431568, + "step": 19045 + }, + { + "epoch": 11.232311320754716, + "grad_norm": 2.6362509727478027, + "learning_rate": 4.797839791772286e-06, + "loss": 0.3515, + "num_input_tokens_seen": 12434384, + "step": 19050 + }, + { + "epoch": 11.235259433962264, + "grad_norm": 5.0005693435668945, + "learning_rate": 4.795269208269752e-06, + "loss": 0.3832, + "num_input_tokens_seen": 12436784, + "step": 19055 + }, + { + "epoch": 11.23820754716981, + "grad_norm": 2.8635447025299072, + "learning_rate": 4.792698678970579e-06, + "loss": 0.3189, + "num_input_tokens_seen": 12440592, + "step": 19060 + }, + { + "epoch": 11.241155660377359, + "grad_norm": 3.8279361724853516, + "learning_rate": 4.7901282045553245e-06, + "loss": 0.3805, + "num_input_tokens_seen": 12443344, + "step": 19065 + }, + { + "epoch": 11.244103773584905, + "grad_norm": 2.2663140296936035, + "learning_rate": 4.787557785704531e-06, + "loss": 0.4064, + "num_input_tokens_seen": 12446736, + "step": 19070 + }, + { + "epoch": 11.247051886792454, + "grad_norm": 2.024879217147827, + "learning_rate": 4.784987423098731e-06, + "loss": 0.3028, + "num_input_tokens_seen": 12450128, + "step": 19075 + }, + { + "epoch": 11.25, + "grad_norm": 3.1784489154815674, + "learning_rate": 4.7824171174184354e-06, + "loss": 0.3516, + "num_input_tokens_seen": 12452976, + "step": 19080 + }, + { + "epoch": 11.252948113207546, + "grad_norm": 4.035409927368164, + "learning_rate": 4.779846869344146e-06, + "loss": 0.3235, + "num_input_tokens_seen": 12455984, + "step": 19085 + }, + { + "epoch": 11.255896226415095, + "grad_norm": 2.4025628566741943, + "learning_rate": 4.777276679556346e-06, + "loss": 0.2961, + "num_input_tokens_seen": 12459088, + "step": 19090 + }, + { + "epoch": 11.258844339622641, + "grad_norm": 1.878843903541565, + "learning_rate": 4.774706548735507e-06, + "loss": 0.3704, + "num_input_tokens_seen": 12461712, + "step": 19095 + }, + { + "epoch": 11.26179245283019, + "grad_norm": 2.100754737854004, + "learning_rate": 4.77213647756208e-06, + "loss": 0.3257, + "num_input_tokens_seen": 12465072, + "step": 19100 + }, + { + "epoch": 11.264740566037736, + "grad_norm": 3.777226448059082, + "learning_rate": 4.769566466716501e-06, + "loss": 0.3267, + "num_input_tokens_seen": 12467984, + "step": 19105 + }, + { + "epoch": 11.267688679245284, + "grad_norm": 2.3994710445404053, + "learning_rate": 4.766996516879195e-06, + "loss": 0.5089, + "num_input_tokens_seen": 12471280, + "step": 19110 + }, + { + "epoch": 11.27063679245283, + "grad_norm": 2.4616613388061523, + "learning_rate": 4.764426628730564e-06, + "loss": 0.4872, + "num_input_tokens_seen": 12474736, + "step": 19115 + }, + { + "epoch": 11.273584905660377, + "grad_norm": 2.025953769683838, + "learning_rate": 4.761856802950999e-06, + "loss": 0.2715, + "num_input_tokens_seen": 12478064, + "step": 19120 + }, + { + "epoch": 11.276533018867925, + "grad_norm": 3.4830243587493896, + "learning_rate": 4.759287040220872e-06, + "loss": 0.5084, + "num_input_tokens_seen": 12480944, + "step": 19125 + }, + { + "epoch": 11.279481132075471, + "grad_norm": 4.849188804626465, + "learning_rate": 4.756717341220538e-06, + "loss": 0.2879, + "num_input_tokens_seen": 12483728, + "step": 19130 + }, + { + "epoch": 11.28242924528302, + "grad_norm": 2.362220048904419, + "learning_rate": 4.7541477066303365e-06, + "loss": 0.4629, + "num_input_tokens_seen": 12486160, + "step": 19135 + }, + { + "epoch": 11.285377358490566, + "grad_norm": 4.428398132324219, + "learning_rate": 4.75157813713059e-06, + "loss": 0.3607, + "num_input_tokens_seen": 12489744, + "step": 19140 + }, + { + "epoch": 11.288325471698114, + "grad_norm": 6.311544895172119, + "learning_rate": 4.7490086334016e-06, + "loss": 0.3728, + "num_input_tokens_seen": 12492304, + "step": 19145 + }, + { + "epoch": 11.29127358490566, + "grad_norm": 5.3923115730285645, + "learning_rate": 4.746439196123659e-06, + "loss": 0.319, + "num_input_tokens_seen": 12496144, + "step": 19150 + }, + { + "epoch": 11.294221698113208, + "grad_norm": 6.6790947914123535, + "learning_rate": 4.743869825977032e-06, + "loss": 0.3354, + "num_input_tokens_seen": 12499152, + "step": 19155 + }, + { + "epoch": 11.297169811320755, + "grad_norm": 4.919562816619873, + "learning_rate": 4.741300523641972e-06, + "loss": 0.3769, + "num_input_tokens_seen": 12501168, + "step": 19160 + }, + { + "epoch": 11.300117924528301, + "grad_norm": 3.749565362930298, + "learning_rate": 4.738731289798715e-06, + "loss": 0.3989, + "num_input_tokens_seen": 12504240, + "step": 19165 + }, + { + "epoch": 11.30306603773585, + "grad_norm": 3.7977521419525146, + "learning_rate": 4.7361621251274744e-06, + "loss": 0.4335, + "num_input_tokens_seen": 12507952, + "step": 19170 + }, + { + "epoch": 11.306014150943396, + "grad_norm": 3.0528500080108643, + "learning_rate": 4.733593030308446e-06, + "loss": 0.4033, + "num_input_tokens_seen": 12511536, + "step": 19175 + }, + { + "epoch": 11.308962264150944, + "grad_norm": 2.1469242572784424, + "learning_rate": 4.731024006021814e-06, + "loss": 0.3032, + "num_input_tokens_seen": 12514256, + "step": 19180 + }, + { + "epoch": 11.31191037735849, + "grad_norm": 2.782423496246338, + "learning_rate": 4.728455052947732e-06, + "loss": 0.332, + "num_input_tokens_seen": 12517936, + "step": 19185 + }, + { + "epoch": 11.314858490566039, + "grad_norm": 5.1573076248168945, + "learning_rate": 4.725886171766349e-06, + "loss": 0.3474, + "num_input_tokens_seen": 12520784, + "step": 19190 + }, + { + "epoch": 11.317806603773585, + "grad_norm": 3.7818949222564697, + "learning_rate": 4.723317363157781e-06, + "loss": 0.2432, + "num_input_tokens_seen": 12524016, + "step": 19195 + }, + { + "epoch": 11.320754716981131, + "grad_norm": 2.8232312202453613, + "learning_rate": 4.720748627802135e-06, + "loss": 0.3114, + "num_input_tokens_seen": 12526960, + "step": 19200 + }, + { + "epoch": 11.32370283018868, + "grad_norm": 2.4666051864624023, + "learning_rate": 4.718179966379492e-06, + "loss": 0.3086, + "num_input_tokens_seen": 12530736, + "step": 19205 + }, + { + "epoch": 11.326650943396226, + "grad_norm": 4.753872871398926, + "learning_rate": 4.715611379569919e-06, + "loss": 0.299, + "num_input_tokens_seen": 12533520, + "step": 19210 + }, + { + "epoch": 11.329599056603774, + "grad_norm": 4.082564353942871, + "learning_rate": 4.713042868053458e-06, + "loss": 0.3886, + "num_input_tokens_seen": 12536656, + "step": 19215 + }, + { + "epoch": 11.33254716981132, + "grad_norm": 2.0688021183013916, + "learning_rate": 4.7104744325101345e-06, + "loss": 0.4137, + "num_input_tokens_seen": 12540112, + "step": 19220 + }, + { + "epoch": 11.335495283018869, + "grad_norm": 4.766937732696533, + "learning_rate": 4.7079060736199525e-06, + "loss": 0.2594, + "num_input_tokens_seen": 12542640, + "step": 19225 + }, + { + "epoch": 11.338443396226415, + "grad_norm": 2.9978954792022705, + "learning_rate": 4.705337792062897e-06, + "loss": 0.3725, + "num_input_tokens_seen": 12545808, + "step": 19230 + }, + { + "epoch": 11.341391509433961, + "grad_norm": 3.5566420555114746, + "learning_rate": 4.702769588518931e-06, + "loss": 0.4459, + "num_input_tokens_seen": 12548528, + "step": 19235 + }, + { + "epoch": 11.34433962264151, + "grad_norm": 2.732017755508423, + "learning_rate": 4.700201463667996e-06, + "loss": 0.427, + "num_input_tokens_seen": 12551792, + "step": 19240 + }, + { + "epoch": 11.347287735849056, + "grad_norm": 4.120290279388428, + "learning_rate": 4.697633418190017e-06, + "loss": 0.4721, + "num_input_tokens_seen": 12555184, + "step": 19245 + }, + { + "epoch": 11.350235849056604, + "grad_norm": 3.1967711448669434, + "learning_rate": 4.695065452764893e-06, + "loss": 0.4315, + "num_input_tokens_seen": 12558160, + "step": 19250 + }, + { + "epoch": 11.35318396226415, + "grad_norm": 3.2372758388519287, + "learning_rate": 4.692497568072505e-06, + "loss": 0.333, + "num_input_tokens_seen": 12562000, + "step": 19255 + }, + { + "epoch": 11.356132075471699, + "grad_norm": 3.3249053955078125, + "learning_rate": 4.689929764792711e-06, + "loss": 0.3627, + "num_input_tokens_seen": 12564720, + "step": 19260 + }, + { + "epoch": 11.359080188679245, + "grad_norm": 2.384399890899658, + "learning_rate": 4.687362043605349e-06, + "loss": 0.385, + "num_input_tokens_seen": 12568208, + "step": 19265 + }, + { + "epoch": 11.362028301886792, + "grad_norm": 2.355854034423828, + "learning_rate": 4.684794405190231e-06, + "loss": 0.3423, + "num_input_tokens_seen": 12571152, + "step": 19270 + }, + { + "epoch": 11.36497641509434, + "grad_norm": 3.4837660789489746, + "learning_rate": 4.682226850227155e-06, + "loss": 0.3624, + "num_input_tokens_seen": 12574896, + "step": 19275 + }, + { + "epoch": 11.367924528301886, + "grad_norm": 2.0178909301757812, + "learning_rate": 4.6796593793958875e-06, + "loss": 0.3234, + "num_input_tokens_seen": 12578384, + "step": 19280 + }, + { + "epoch": 11.370872641509434, + "grad_norm": 3.545686721801758, + "learning_rate": 4.6770919933761815e-06, + "loss": 0.337, + "num_input_tokens_seen": 12581168, + "step": 19285 + }, + { + "epoch": 11.37382075471698, + "grad_norm": 2.779470920562744, + "learning_rate": 4.6745246928477615e-06, + "loss": 0.4259, + "num_input_tokens_seen": 12584272, + "step": 19290 + }, + { + "epoch": 11.376768867924529, + "grad_norm": 3.018712282180786, + "learning_rate": 4.671957478490332e-06, + "loss": 0.5189, + "num_input_tokens_seen": 12587440, + "step": 19295 + }, + { + "epoch": 11.379716981132075, + "grad_norm": 2.346397876739502, + "learning_rate": 4.669390350983574e-06, + "loss": 0.6201, + "num_input_tokens_seen": 12592208, + "step": 19300 + }, + { + "epoch": 11.382665094339623, + "grad_norm": 2.698775053024292, + "learning_rate": 4.666823311007145e-06, + "loss": 0.2941, + "num_input_tokens_seen": 12595504, + "step": 19305 + }, + { + "epoch": 11.38561320754717, + "grad_norm": 4.345825672149658, + "learning_rate": 4.664256359240681e-06, + "loss": 0.3889, + "num_input_tokens_seen": 12597776, + "step": 19310 + }, + { + "epoch": 11.388561320754716, + "grad_norm": 2.9866530895233154, + "learning_rate": 4.661689496363793e-06, + "loss": 0.2552, + "num_input_tokens_seen": 12600368, + "step": 19315 + }, + { + "epoch": 11.391509433962264, + "grad_norm": 1.9587651491165161, + "learning_rate": 4.659122723056068e-06, + "loss": 0.32, + "num_input_tokens_seen": 12603664, + "step": 19320 + }, + { + "epoch": 11.39445754716981, + "grad_norm": 2.97615647315979, + "learning_rate": 4.656556039997072e-06, + "loss": 0.3748, + "num_input_tokens_seen": 12606544, + "step": 19325 + }, + { + "epoch": 11.397405660377359, + "grad_norm": 2.449540138244629, + "learning_rate": 4.653989447866345e-06, + "loss": 0.3998, + "num_input_tokens_seen": 12610160, + "step": 19330 + }, + { + "epoch": 11.400353773584905, + "grad_norm": 3.9174702167510986, + "learning_rate": 4.651422947343401e-06, + "loss": 0.29, + "num_input_tokens_seen": 12613584, + "step": 19335 + }, + { + "epoch": 11.403301886792454, + "grad_norm": 2.467662811279297, + "learning_rate": 4.6488565391077355e-06, + "loss": 0.2793, + "num_input_tokens_seen": 12616560, + "step": 19340 + }, + { + "epoch": 11.40625, + "grad_norm": 3.8434083461761475, + "learning_rate": 4.646290223838815e-06, + "loss": 0.3551, + "num_input_tokens_seen": 12619184, + "step": 19345 + }, + { + "epoch": 11.409198113207546, + "grad_norm": 2.0932297706604004, + "learning_rate": 4.64372400221608e-06, + "loss": 0.4112, + "num_input_tokens_seen": 12622384, + "step": 19350 + }, + { + "epoch": 11.412146226415095, + "grad_norm": 2.6615967750549316, + "learning_rate": 4.641157874918953e-06, + "loss": 0.4107, + "num_input_tokens_seen": 12625296, + "step": 19355 + }, + { + "epoch": 11.415094339622641, + "grad_norm": 2.929427146911621, + "learning_rate": 4.6385918426268245e-06, + "loss": 0.3373, + "num_input_tokens_seen": 12628496, + "step": 19360 + }, + { + "epoch": 11.41804245283019, + "grad_norm": 2.00095272064209, + "learning_rate": 4.636025906019062e-06, + "loss": 0.3276, + "num_input_tokens_seen": 12631824, + "step": 19365 + }, + { + "epoch": 11.420990566037736, + "grad_norm": 5.305765151977539, + "learning_rate": 4.6334600657750115e-06, + "loss": 0.4927, + "num_input_tokens_seen": 12634672, + "step": 19370 + }, + { + "epoch": 11.423938679245284, + "grad_norm": 4.474178791046143, + "learning_rate": 4.6308943225739855e-06, + "loss": 0.4817, + "num_input_tokens_seen": 12637648, + "step": 19375 + }, + { + "epoch": 11.42688679245283, + "grad_norm": 3.0600359439849854, + "learning_rate": 4.628328677095281e-06, + "loss": 0.4268, + "num_input_tokens_seen": 12640752, + "step": 19380 + }, + { + "epoch": 11.429834905660377, + "grad_norm": 5.619620323181152, + "learning_rate": 4.625763130018159e-06, + "loss": 0.3703, + "num_input_tokens_seen": 12643952, + "step": 19385 + }, + { + "epoch": 11.432783018867925, + "grad_norm": 2.64030122756958, + "learning_rate": 4.6231976820218635e-06, + "loss": 0.3529, + "num_input_tokens_seen": 12647216, + "step": 19390 + }, + { + "epoch": 11.435731132075471, + "grad_norm": 3.766113519668579, + "learning_rate": 4.620632333785605e-06, + "loss": 0.3568, + "num_input_tokens_seen": 12650224, + "step": 19395 + }, + { + "epoch": 11.43867924528302, + "grad_norm": 2.5583231449127197, + "learning_rate": 4.618067085988569e-06, + "loss": 0.3297, + "num_input_tokens_seen": 12653488, + "step": 19400 + }, + { + "epoch": 11.441627358490566, + "grad_norm": 3.055851697921753, + "learning_rate": 4.615501939309921e-06, + "loss": 0.2986, + "num_input_tokens_seen": 12657232, + "step": 19405 + }, + { + "epoch": 11.444575471698114, + "grad_norm": 3.5484273433685303, + "learning_rate": 4.612936894428791e-06, + "loss": 0.3339, + "num_input_tokens_seen": 12660400, + "step": 19410 + }, + { + "epoch": 11.44752358490566, + "grad_norm": 3.0802512168884277, + "learning_rate": 4.610371952024285e-06, + "loss": 0.5552, + "num_input_tokens_seen": 12664944, + "step": 19415 + }, + { + "epoch": 11.450471698113208, + "grad_norm": 3.457401990890503, + "learning_rate": 4.607807112775485e-06, + "loss": 0.2934, + "num_input_tokens_seen": 12668272, + "step": 19420 + }, + { + "epoch": 11.453419811320755, + "grad_norm": 2.4113500118255615, + "learning_rate": 4.605242377361441e-06, + "loss": 0.3197, + "num_input_tokens_seen": 12672272, + "step": 19425 + }, + { + "epoch": 11.456367924528301, + "grad_norm": 1.6858993768692017, + "learning_rate": 4.60267774646118e-06, + "loss": 0.3842, + "num_input_tokens_seen": 12675760, + "step": 19430 + }, + { + "epoch": 11.45931603773585, + "grad_norm": 3.6357429027557373, + "learning_rate": 4.600113220753698e-06, + "loss": 0.4433, + "num_input_tokens_seen": 12678896, + "step": 19435 + }, + { + "epoch": 11.462264150943396, + "grad_norm": 3.8535728454589844, + "learning_rate": 4.597548800917964e-06, + "loss": 0.387, + "num_input_tokens_seen": 12682256, + "step": 19440 + }, + { + "epoch": 11.465212264150944, + "grad_norm": 2.2702760696411133, + "learning_rate": 4.594984487632919e-06, + "loss": 0.4222, + "num_input_tokens_seen": 12686544, + "step": 19445 + }, + { + "epoch": 11.46816037735849, + "grad_norm": 1.961531162261963, + "learning_rate": 4.592420281577478e-06, + "loss": 0.3709, + "num_input_tokens_seen": 12689520, + "step": 19450 + }, + { + "epoch": 11.471108490566039, + "grad_norm": 2.4612793922424316, + "learning_rate": 4.589856183430521e-06, + "loss": 0.3176, + "num_input_tokens_seen": 12692336, + "step": 19455 + }, + { + "epoch": 11.474056603773585, + "grad_norm": 3.8923072814941406, + "learning_rate": 4.58729219387091e-06, + "loss": 0.3825, + "num_input_tokens_seen": 12696784, + "step": 19460 + }, + { + "epoch": 11.477004716981131, + "grad_norm": 3.540701389312744, + "learning_rate": 4.584728313577468e-06, + "loss": 0.2807, + "num_input_tokens_seen": 12701584, + "step": 19465 + }, + { + "epoch": 11.47995283018868, + "grad_norm": 5.023279190063477, + "learning_rate": 4.582164543228993e-06, + "loss": 0.3338, + "num_input_tokens_seen": 12704624, + "step": 19470 + }, + { + "epoch": 11.482900943396226, + "grad_norm": 2.656906843185425, + "learning_rate": 4.5796008835042574e-06, + "loss": 0.3215, + "num_input_tokens_seen": 12708368, + "step": 19475 + }, + { + "epoch": 11.485849056603774, + "grad_norm": 2.5755786895751953, + "learning_rate": 4.577037335082e-06, + "loss": 0.4306, + "num_input_tokens_seen": 12712304, + "step": 19480 + }, + { + "epoch": 11.48879716981132, + "grad_norm": 4.281582832336426, + "learning_rate": 4.574473898640933e-06, + "loss": 0.437, + "num_input_tokens_seen": 12715600, + "step": 19485 + }, + { + "epoch": 11.491745283018869, + "grad_norm": 2.88093638420105, + "learning_rate": 4.571910574859732e-06, + "loss": 0.358, + "num_input_tokens_seen": 12718192, + "step": 19490 + }, + { + "epoch": 11.494693396226415, + "grad_norm": 2.4662725925445557, + "learning_rate": 4.5693473644170535e-06, + "loss": 0.3712, + "num_input_tokens_seen": 12723344, + "step": 19495 + }, + { + "epoch": 11.497641509433961, + "grad_norm": 4.7870564460754395, + "learning_rate": 4.566784267991516e-06, + "loss": 0.395, + "num_input_tokens_seen": 12726480, + "step": 19500 + }, + { + "epoch": 11.50058962264151, + "grad_norm": 4.4008283615112305, + "learning_rate": 4.564221286261709e-06, + "loss": 0.4189, + "num_input_tokens_seen": 12729936, + "step": 19505 + }, + { + "epoch": 11.503537735849056, + "grad_norm": 4.64227819442749, + "learning_rate": 4.5616584199061964e-06, + "loss": 0.4949, + "num_input_tokens_seen": 12732560, + "step": 19510 + }, + { + "epoch": 11.506485849056604, + "grad_norm": 2.7179620265960693, + "learning_rate": 4.559095669603506e-06, + "loss": 0.3673, + "num_input_tokens_seen": 12736592, + "step": 19515 + }, + { + "epoch": 11.50943396226415, + "grad_norm": 2.6628541946411133, + "learning_rate": 4.556533036032136e-06, + "loss": 0.2726, + "num_input_tokens_seen": 12739824, + "step": 19520 + }, + { + "epoch": 11.512382075471699, + "grad_norm": 3.0179929733276367, + "learning_rate": 4.553970519870557e-06, + "loss": 0.3249, + "num_input_tokens_seen": 12742864, + "step": 19525 + }, + { + "epoch": 11.515330188679245, + "grad_norm": 4.008169174194336, + "learning_rate": 4.551408121797205e-06, + "loss": 0.3526, + "num_input_tokens_seen": 12746480, + "step": 19530 + }, + { + "epoch": 11.518278301886792, + "grad_norm": 2.7741737365722656, + "learning_rate": 4.548845842490486e-06, + "loss": 0.3497, + "num_input_tokens_seen": 12749104, + "step": 19535 + }, + { + "epoch": 11.52122641509434, + "grad_norm": 3.064547061920166, + "learning_rate": 4.5462836826287745e-06, + "loss": 0.2583, + "num_input_tokens_seen": 12753360, + "step": 19540 + }, + { + "epoch": 11.524174528301886, + "grad_norm": 3.454883337020874, + "learning_rate": 4.543721642890414e-06, + "loss": 0.3361, + "num_input_tokens_seen": 12756208, + "step": 19545 + }, + { + "epoch": 11.527122641509434, + "grad_norm": 8.238747596740723, + "learning_rate": 4.541159723953714e-06, + "loss": 0.3636, + "num_input_tokens_seen": 12759120, + "step": 19550 + }, + { + "epoch": 11.53007075471698, + "grad_norm": 4.033265113830566, + "learning_rate": 4.538597926496955e-06, + "loss": 0.3877, + "num_input_tokens_seen": 12761968, + "step": 19555 + }, + { + "epoch": 11.533018867924529, + "grad_norm": 3.488154411315918, + "learning_rate": 4.536036251198384e-06, + "loss": 0.3654, + "num_input_tokens_seen": 12765232, + "step": 19560 + }, + { + "epoch": 11.535966981132075, + "grad_norm": 1.8889278173446655, + "learning_rate": 4.5334746987362124e-06, + "loss": 0.3726, + "num_input_tokens_seen": 12768976, + "step": 19565 + }, + { + "epoch": 11.538915094339622, + "grad_norm": 2.3891000747680664, + "learning_rate": 4.530913269788627e-06, + "loss": 0.3561, + "num_input_tokens_seen": 12772816, + "step": 19570 + }, + { + "epoch": 11.54186320754717, + "grad_norm": 2.2001259326934814, + "learning_rate": 4.528351965033775e-06, + "loss": 0.4169, + "num_input_tokens_seen": 12777488, + "step": 19575 + }, + { + "epoch": 11.544811320754716, + "grad_norm": 2.8217122554779053, + "learning_rate": 4.525790785149774e-06, + "loss": 0.3557, + "num_input_tokens_seen": 12781040, + "step": 19580 + }, + { + "epoch": 11.547759433962264, + "grad_norm": 6.920278072357178, + "learning_rate": 4.523229730814705e-06, + "loss": 0.4655, + "num_input_tokens_seen": 12784080, + "step": 19585 + }, + { + "epoch": 11.55070754716981, + "grad_norm": 2.715130090713501, + "learning_rate": 4.52066880270662e-06, + "loss": 0.2927, + "num_input_tokens_seen": 12786736, + "step": 19590 + }, + { + "epoch": 11.553655660377359, + "grad_norm": 2.232081890106201, + "learning_rate": 4.518108001503536e-06, + "loss": 0.3374, + "num_input_tokens_seen": 12790000, + "step": 19595 + }, + { + "epoch": 11.556603773584905, + "grad_norm": 2.4387500286102295, + "learning_rate": 4.515547327883434e-06, + "loss": 0.3287, + "num_input_tokens_seen": 12793456, + "step": 19600 + }, + { + "epoch": 11.559551886792454, + "grad_norm": 2.5698351860046387, + "learning_rate": 4.512986782524266e-06, + "loss": 0.3315, + "num_input_tokens_seen": 12796784, + "step": 19605 + }, + { + "epoch": 11.5625, + "grad_norm": 1.8755861520767212, + "learning_rate": 4.510426366103946e-06, + "loss": 0.253, + "num_input_tokens_seen": 12799632, + "step": 19610 + }, + { + "epoch": 11.565448113207546, + "grad_norm": 3.3739631175994873, + "learning_rate": 4.5078660793003544e-06, + "loss": 0.4082, + "num_input_tokens_seen": 12802736, + "step": 19615 + }, + { + "epoch": 11.568396226415095, + "grad_norm": 3.7542672157287598, + "learning_rate": 4.505305922791341e-06, + "loss": 0.3562, + "num_input_tokens_seen": 12806384, + "step": 19620 + }, + { + "epoch": 11.571344339622641, + "grad_norm": 2.9826900959014893, + "learning_rate": 4.502745897254716e-06, + "loss": 0.3332, + "num_input_tokens_seen": 12809456, + "step": 19625 + }, + { + "epoch": 11.57429245283019, + "grad_norm": 2.2935402393341064, + "learning_rate": 4.500186003368257e-06, + "loss": 0.337, + "num_input_tokens_seen": 12812528, + "step": 19630 + }, + { + "epoch": 11.577240566037736, + "grad_norm": 3.0368728637695312, + "learning_rate": 4.497626241809709e-06, + "loss": 0.3819, + "num_input_tokens_seen": 12816048, + "step": 19635 + }, + { + "epoch": 11.580188679245284, + "grad_norm": 2.782705307006836, + "learning_rate": 4.4950666132567775e-06, + "loss": 0.2827, + "num_input_tokens_seen": 12818992, + "step": 19640 + }, + { + "epoch": 11.58313679245283, + "grad_norm": 4.114466667175293, + "learning_rate": 4.492507118387136e-06, + "loss": 0.4781, + "num_input_tokens_seen": 12822064, + "step": 19645 + }, + { + "epoch": 11.586084905660378, + "grad_norm": 1.5947636365890503, + "learning_rate": 4.4899477578784235e-06, + "loss": 0.335, + "num_input_tokens_seen": 12825904, + "step": 19650 + }, + { + "epoch": 11.589033018867925, + "grad_norm": 2.4638819694519043, + "learning_rate": 4.487388532408239e-06, + "loss": 0.2272, + "num_input_tokens_seen": 12830160, + "step": 19655 + }, + { + "epoch": 11.591981132075471, + "grad_norm": 4.413480281829834, + "learning_rate": 4.48482944265415e-06, + "loss": 0.338, + "num_input_tokens_seen": 12833360, + "step": 19660 + }, + { + "epoch": 11.59492924528302, + "grad_norm": 3.8675954341888428, + "learning_rate": 4.482270489293685e-06, + "loss": 0.2952, + "num_input_tokens_seen": 12836208, + "step": 19665 + }, + { + "epoch": 11.597877358490566, + "grad_norm": 2.7623445987701416, + "learning_rate": 4.479711673004341e-06, + "loss": 0.3761, + "num_input_tokens_seen": 12839984, + "step": 19670 + }, + { + "epoch": 11.600825471698114, + "grad_norm": 2.715082883834839, + "learning_rate": 4.477152994463575e-06, + "loss": 0.2245, + "num_input_tokens_seen": 12843088, + "step": 19675 + }, + { + "epoch": 11.60377358490566, + "grad_norm": 2.952995538711548, + "learning_rate": 4.474594454348805e-06, + "loss": 0.3974, + "num_input_tokens_seen": 12846736, + "step": 19680 + }, + { + "epoch": 11.606721698113208, + "grad_norm": 3.521667242050171, + "learning_rate": 4.472036053337419e-06, + "loss": 0.3732, + "num_input_tokens_seen": 12849200, + "step": 19685 + }, + { + "epoch": 11.609669811320755, + "grad_norm": 2.310652732849121, + "learning_rate": 4.469477792106761e-06, + "loss": 0.358, + "num_input_tokens_seen": 12852080, + "step": 19690 + }, + { + "epoch": 11.612617924528301, + "grad_norm": 2.3622961044311523, + "learning_rate": 4.466919671334146e-06, + "loss": 0.3108, + "num_input_tokens_seen": 12854864, + "step": 19695 + }, + { + "epoch": 11.61556603773585, + "grad_norm": 3.1215553283691406, + "learning_rate": 4.4643616916968456e-06, + "loss": 0.3447, + "num_input_tokens_seen": 12857776, + "step": 19700 + }, + { + "epoch": 11.618514150943396, + "grad_norm": 3.8869869709014893, + "learning_rate": 4.461803853872095e-06, + "loss": 0.3862, + "num_input_tokens_seen": 12861360, + "step": 19705 + }, + { + "epoch": 11.621462264150944, + "grad_norm": 2.6695749759674072, + "learning_rate": 4.459246158537094e-06, + "loss": 0.2916, + "num_input_tokens_seen": 12864976, + "step": 19710 + }, + { + "epoch": 11.62441037735849, + "grad_norm": 2.401380777359009, + "learning_rate": 4.456688606369003e-06, + "loss": 0.3191, + "num_input_tokens_seen": 12867856, + "step": 19715 + }, + { + "epoch": 11.627358490566039, + "grad_norm": 1.7639538049697876, + "learning_rate": 4.4541311980449446e-06, + "loss": 0.251, + "num_input_tokens_seen": 12876464, + "step": 19720 + }, + { + "epoch": 11.630306603773585, + "grad_norm": 4.165428161621094, + "learning_rate": 4.4515739342420045e-06, + "loss": 0.3129, + "num_input_tokens_seen": 12879952, + "step": 19725 + }, + { + "epoch": 11.633254716981131, + "grad_norm": 2.7343051433563232, + "learning_rate": 4.4490168156372294e-06, + "loss": 0.4182, + "num_input_tokens_seen": 12883088, + "step": 19730 + }, + { + "epoch": 11.63620283018868, + "grad_norm": 2.850191354751587, + "learning_rate": 4.446459842907626e-06, + "loss": 0.3258, + "num_input_tokens_seen": 12885840, + "step": 19735 + }, + { + "epoch": 11.639150943396226, + "grad_norm": 2.14508318901062, + "learning_rate": 4.443903016730165e-06, + "loss": 0.3486, + "num_input_tokens_seen": 12888656, + "step": 19740 + }, + { + "epoch": 11.642099056603774, + "grad_norm": 3.1416988372802734, + "learning_rate": 4.4413463377817775e-06, + "loss": 0.3017, + "num_input_tokens_seen": 12892880, + "step": 19745 + }, + { + "epoch": 11.64504716981132, + "grad_norm": 2.0887463092803955, + "learning_rate": 4.438789806739353e-06, + "loss": 0.3611, + "num_input_tokens_seen": 12895792, + "step": 19750 + }, + { + "epoch": 11.647995283018869, + "grad_norm": 4.08836030960083, + "learning_rate": 4.436233424279746e-06, + "loss": 0.3311, + "num_input_tokens_seen": 12898672, + "step": 19755 + }, + { + "epoch": 11.650943396226415, + "grad_norm": 2.666257619857788, + "learning_rate": 4.433677191079771e-06, + "loss": 0.4604, + "num_input_tokens_seen": 12902576, + "step": 19760 + }, + { + "epoch": 11.653891509433961, + "grad_norm": 2.1791186332702637, + "learning_rate": 4.431121107816201e-06, + "loss": 0.3461, + "num_input_tokens_seen": 12905616, + "step": 19765 + }, + { + "epoch": 11.65683962264151, + "grad_norm": 2.8505682945251465, + "learning_rate": 4.4285651751657676e-06, + "loss": 0.4085, + "num_input_tokens_seen": 12908752, + "step": 19770 + }, + { + "epoch": 11.659787735849056, + "grad_norm": 2.407656669616699, + "learning_rate": 4.4260093938051685e-06, + "loss": 0.4832, + "num_input_tokens_seen": 12911824, + "step": 19775 + }, + { + "epoch": 11.662735849056604, + "grad_norm": 3.032538652420044, + "learning_rate": 4.423453764411056e-06, + "loss": 0.2572, + "num_input_tokens_seen": 12914992, + "step": 19780 + }, + { + "epoch": 11.66568396226415, + "grad_norm": 5.86004638671875, + "learning_rate": 4.4208982876600425e-06, + "loss": 0.3632, + "num_input_tokens_seen": 12917744, + "step": 19785 + }, + { + "epoch": 11.668632075471699, + "grad_norm": 2.6655428409576416, + "learning_rate": 4.418342964228705e-06, + "loss": 0.4312, + "num_input_tokens_seen": 12921424, + "step": 19790 + }, + { + "epoch": 11.671580188679245, + "grad_norm": 2.5237255096435547, + "learning_rate": 4.415787794793574e-06, + "loss": 0.349, + "num_input_tokens_seen": 12924336, + "step": 19795 + }, + { + "epoch": 11.674528301886792, + "grad_norm": 2.736051559448242, + "learning_rate": 4.4132327800311414e-06, + "loss": 0.3619, + "num_input_tokens_seen": 12927728, + "step": 19800 + }, + { + "epoch": 11.67747641509434, + "grad_norm": 3.1326725482940674, + "learning_rate": 4.41067792061786e-06, + "loss": 0.4579, + "num_input_tokens_seen": 12932336, + "step": 19805 + }, + { + "epoch": 11.680424528301886, + "grad_norm": 3.5012478828430176, + "learning_rate": 4.408123217230139e-06, + "loss": 0.324, + "num_input_tokens_seen": 12936688, + "step": 19810 + }, + { + "epoch": 11.683372641509434, + "grad_norm": 2.7109692096710205, + "learning_rate": 4.4055686705443464e-06, + "loss": 0.3054, + "num_input_tokens_seen": 12939120, + "step": 19815 + }, + { + "epoch": 11.68632075471698, + "grad_norm": 4.324853420257568, + "learning_rate": 4.4030142812368106e-06, + "loss": 0.5332, + "num_input_tokens_seen": 12941712, + "step": 19820 + }, + { + "epoch": 11.689268867924529, + "grad_norm": 2.4789862632751465, + "learning_rate": 4.400460049983817e-06, + "loss": 0.3176, + "num_input_tokens_seen": 12944560, + "step": 19825 + }, + { + "epoch": 11.692216981132075, + "grad_norm": 4.248828887939453, + "learning_rate": 4.397905977461608e-06, + "loss": 0.3162, + "num_input_tokens_seen": 12947792, + "step": 19830 + }, + { + "epoch": 11.695165094339622, + "grad_norm": 3.450928211212158, + "learning_rate": 4.395352064346387e-06, + "loss": 0.381, + "num_input_tokens_seen": 12951184, + "step": 19835 + }, + { + "epoch": 11.69811320754717, + "grad_norm": 3.4724326133728027, + "learning_rate": 4.392798311314314e-06, + "loss": 0.3052, + "num_input_tokens_seen": 12953872, + "step": 19840 + }, + { + "epoch": 11.701061320754716, + "grad_norm": 5.270394325256348, + "learning_rate": 4.390244719041502e-06, + "loss": 0.3202, + "num_input_tokens_seen": 12956848, + "step": 19845 + }, + { + "epoch": 11.704009433962264, + "grad_norm": 2.672513008117676, + "learning_rate": 4.387691288204029e-06, + "loss": 0.3853, + "num_input_tokens_seen": 12959792, + "step": 19850 + }, + { + "epoch": 11.70695754716981, + "grad_norm": 3.9002599716186523, + "learning_rate": 4.385138019477928e-06, + "loss": 0.4453, + "num_input_tokens_seen": 12963728, + "step": 19855 + }, + { + "epoch": 11.709905660377359, + "grad_norm": 2.542320728302002, + "learning_rate": 4.3825849135391864e-06, + "loss": 0.3877, + "num_input_tokens_seen": 12966928, + "step": 19860 + }, + { + "epoch": 11.712853773584905, + "grad_norm": 2.5021073818206787, + "learning_rate": 4.380031971063748e-06, + "loss": 0.3766, + "num_input_tokens_seen": 12970544, + "step": 19865 + }, + { + "epoch": 11.715801886792454, + "grad_norm": 3.959275245666504, + "learning_rate": 4.37747919272752e-06, + "loss": 0.2545, + "num_input_tokens_seen": 12973904, + "step": 19870 + }, + { + "epoch": 11.71875, + "grad_norm": 3.515876293182373, + "learning_rate": 4.374926579206357e-06, + "loss": 0.3655, + "num_input_tokens_seen": 12977232, + "step": 19875 + }, + { + "epoch": 11.721698113207546, + "grad_norm": 2.9236373901367188, + "learning_rate": 4.372374131176075e-06, + "loss": 0.3731, + "num_input_tokens_seen": 12980016, + "step": 19880 + }, + { + "epoch": 11.724646226415095, + "grad_norm": 3.3351829051971436, + "learning_rate": 4.369821849312449e-06, + "loss": 0.3131, + "num_input_tokens_seen": 12982736, + "step": 19885 + }, + { + "epoch": 11.727594339622641, + "grad_norm": 3.3169569969177246, + "learning_rate": 4.367269734291203e-06, + "loss": 0.3864, + "num_input_tokens_seen": 12986480, + "step": 19890 + }, + { + "epoch": 11.73054245283019, + "grad_norm": 2.5217974185943604, + "learning_rate": 4.364717786788022e-06, + "loss": 0.3393, + "num_input_tokens_seen": 12989296, + "step": 19895 + }, + { + "epoch": 11.733490566037736, + "grad_norm": 2.3677427768707275, + "learning_rate": 4.362166007478545e-06, + "loss": 0.304, + "num_input_tokens_seen": 12993008, + "step": 19900 + }, + { + "epoch": 11.736438679245284, + "grad_norm": 3.122501850128174, + "learning_rate": 4.3596143970383665e-06, + "loss": 0.3506, + "num_input_tokens_seen": 12996528, + "step": 19905 + }, + { + "epoch": 11.73938679245283, + "grad_norm": 1.9580241441726685, + "learning_rate": 4.357062956143035e-06, + "loss": 0.2346, + "num_input_tokens_seen": 12999888, + "step": 19910 + }, + { + "epoch": 11.742334905660378, + "grad_norm": 5.113658905029297, + "learning_rate": 4.354511685468059e-06, + "loss": 0.3916, + "num_input_tokens_seen": 13003216, + "step": 19915 + }, + { + "epoch": 11.745283018867925, + "grad_norm": 3.3515193462371826, + "learning_rate": 4.351960585688894e-06, + "loss": 0.4338, + "num_input_tokens_seen": 13006736, + "step": 19920 + }, + { + "epoch": 11.748231132075471, + "grad_norm": 4.886715888977051, + "learning_rate": 4.349409657480959e-06, + "loss": 0.3118, + "num_input_tokens_seen": 13009680, + "step": 19925 + }, + { + "epoch": 11.75117924528302, + "grad_norm": 2.104062795639038, + "learning_rate": 4.34685890151962e-06, + "loss": 0.2942, + "num_input_tokens_seen": 13013040, + "step": 19930 + }, + { + "epoch": 11.754127358490566, + "grad_norm": 2.9694595336914062, + "learning_rate": 4.344308318480201e-06, + "loss": 0.3928, + "num_input_tokens_seen": 13015504, + "step": 19935 + }, + { + "epoch": 11.757075471698114, + "grad_norm": 2.2153384685516357, + "learning_rate": 4.341757909037981e-06, + "loss": 0.434, + "num_input_tokens_seen": 13019152, + "step": 19940 + }, + { + "epoch": 11.76002358490566, + "grad_norm": 5.004147052764893, + "learning_rate": 4.33920767386819e-06, + "loss": 0.3215, + "num_input_tokens_seen": 13022128, + "step": 19945 + }, + { + "epoch": 11.762971698113208, + "grad_norm": 2.9987664222717285, + "learning_rate": 4.336657613646017e-06, + "loss": 0.3521, + "num_input_tokens_seen": 13024784, + "step": 19950 + }, + { + "epoch": 11.765919811320755, + "grad_norm": 1.5151793956756592, + "learning_rate": 4.3341077290466e-06, + "loss": 0.2877, + "num_input_tokens_seen": 13028176, + "step": 19955 + }, + { + "epoch": 11.768867924528301, + "grad_norm": 3.2525863647460938, + "learning_rate": 4.331558020745031e-06, + "loss": 0.3249, + "num_input_tokens_seen": 13032272, + "step": 19960 + }, + { + "epoch": 11.77181603773585, + "grad_norm": 4.664340496063232, + "learning_rate": 4.329008489416357e-06, + "loss": 0.3644, + "num_input_tokens_seen": 13034832, + "step": 19965 + }, + { + "epoch": 11.774764150943396, + "grad_norm": 3.4732789993286133, + "learning_rate": 4.326459135735576e-06, + "loss": 0.4821, + "num_input_tokens_seen": 13037200, + "step": 19970 + }, + { + "epoch": 11.777712264150944, + "grad_norm": 2.2168478965759277, + "learning_rate": 4.323909960377644e-06, + "loss": 0.2726, + "num_input_tokens_seen": 13041680, + "step": 19975 + }, + { + "epoch": 11.78066037735849, + "grad_norm": 2.3234710693359375, + "learning_rate": 4.3213609640174625e-06, + "loss": 0.3055, + "num_input_tokens_seen": 13044528, + "step": 19980 + }, + { + "epoch": 11.783608490566039, + "grad_norm": 2.16900372505188, + "learning_rate": 4.318812147329889e-06, + "loss": 0.4243, + "num_input_tokens_seen": 13047248, + "step": 19985 + }, + { + "epoch": 11.786556603773585, + "grad_norm": 2.0913307666778564, + "learning_rate": 4.316263510989737e-06, + "loss": 0.3191, + "num_input_tokens_seen": 13050928, + "step": 19990 + }, + { + "epoch": 11.789504716981131, + "grad_norm": 5.8368096351623535, + "learning_rate": 4.313715055671768e-06, + "loss": 0.4503, + "num_input_tokens_seen": 13053616, + "step": 19995 + }, + { + "epoch": 11.79245283018868, + "grad_norm": 6.281067371368408, + "learning_rate": 4.311166782050694e-06, + "loss": 0.3559, + "num_input_tokens_seen": 13056400, + "step": 20000 + }, + { + "epoch": 11.795400943396226, + "grad_norm": 2.9095821380615234, + "learning_rate": 4.308618690801184e-06, + "loss": 0.3114, + "num_input_tokens_seen": 13059184, + "step": 20005 + }, + { + "epoch": 11.798349056603774, + "grad_norm": 1.770971655845642, + "learning_rate": 4.3060707825978564e-06, + "loss": 0.2989, + "num_input_tokens_seen": 13062576, + "step": 20010 + }, + { + "epoch": 11.80129716981132, + "grad_norm": 2.262758731842041, + "learning_rate": 4.303523058115278e-06, + "loss": 0.2936, + "num_input_tokens_seen": 13065712, + "step": 20015 + }, + { + "epoch": 11.804245283018869, + "grad_norm": 4.326164245605469, + "learning_rate": 4.300975518027972e-06, + "loss": 0.4175, + "num_input_tokens_seen": 13068848, + "step": 20020 + }, + { + "epoch": 11.807193396226415, + "grad_norm": 5.070953369140625, + "learning_rate": 4.298428163010411e-06, + "loss": 0.4302, + "num_input_tokens_seen": 13071408, + "step": 20025 + }, + { + "epoch": 11.810141509433961, + "grad_norm": 4.121811866760254, + "learning_rate": 4.295880993737016e-06, + "loss": 0.3091, + "num_input_tokens_seen": 13074736, + "step": 20030 + }, + { + "epoch": 11.81308962264151, + "grad_norm": 3.1230826377868652, + "learning_rate": 4.293334010882164e-06, + "loss": 0.2936, + "num_input_tokens_seen": 13077168, + "step": 20035 + }, + { + "epoch": 11.816037735849056, + "grad_norm": 2.900219202041626, + "learning_rate": 4.290787215120178e-06, + "loss": 0.3821, + "num_input_tokens_seen": 13080656, + "step": 20040 + }, + { + "epoch": 11.818985849056604, + "grad_norm": 3.019371747970581, + "learning_rate": 4.2882406071253315e-06, + "loss": 0.3946, + "num_input_tokens_seen": 13084496, + "step": 20045 + }, + { + "epoch": 11.82193396226415, + "grad_norm": 3.298809051513672, + "learning_rate": 4.285694187571852e-06, + "loss": 0.326, + "num_input_tokens_seen": 13086864, + "step": 20050 + }, + { + "epoch": 11.824882075471699, + "grad_norm": 3.491697072982788, + "learning_rate": 4.283147957133917e-06, + "loss": 0.3551, + "num_input_tokens_seen": 13090256, + "step": 20055 + }, + { + "epoch": 11.827830188679245, + "grad_norm": 3.8841865062713623, + "learning_rate": 4.280601916485648e-06, + "loss": 0.5957, + "num_input_tokens_seen": 13094544, + "step": 20060 + }, + { + "epoch": 11.830778301886792, + "grad_norm": 3.2441141605377197, + "learning_rate": 4.278056066301123e-06, + "loss": 0.373, + "num_input_tokens_seen": 13098096, + "step": 20065 + }, + { + "epoch": 11.83372641509434, + "grad_norm": 5.585747241973877, + "learning_rate": 4.275510407254366e-06, + "loss": 0.3699, + "num_input_tokens_seen": 13101648, + "step": 20070 + }, + { + "epoch": 11.836674528301886, + "grad_norm": 5.809953212738037, + "learning_rate": 4.27296494001935e-06, + "loss": 0.5472, + "num_input_tokens_seen": 13103760, + "step": 20075 + }, + { + "epoch": 11.839622641509434, + "grad_norm": 2.4274473190307617, + "learning_rate": 4.2704196652700005e-06, + "loss": 0.3275, + "num_input_tokens_seen": 13106256, + "step": 20080 + }, + { + "epoch": 11.84257075471698, + "grad_norm": 7.530351161956787, + "learning_rate": 4.2678745836801895e-06, + "loss": 0.3523, + "num_input_tokens_seen": 13109072, + "step": 20085 + }, + { + "epoch": 11.845518867924529, + "grad_norm": 2.7343080043792725, + "learning_rate": 4.265329695923736e-06, + "loss": 0.3538, + "num_input_tokens_seen": 13111312, + "step": 20090 + }, + { + "epoch": 11.848466981132075, + "grad_norm": 2.927044153213501, + "learning_rate": 4.262785002674412e-06, + "loss": 0.3267, + "num_input_tokens_seen": 13115408, + "step": 20095 + }, + { + "epoch": 11.851415094339622, + "grad_norm": 3.643876552581787, + "learning_rate": 4.260240504605938e-06, + "loss": 0.347, + "num_input_tokens_seen": 13117968, + "step": 20100 + }, + { + "epoch": 11.85436320754717, + "grad_norm": 2.1152777671813965, + "learning_rate": 4.2576962023919775e-06, + "loss": 0.3731, + "num_input_tokens_seen": 13121680, + "step": 20105 + }, + { + "epoch": 11.857311320754716, + "grad_norm": 2.127545118331909, + "learning_rate": 4.255152096706145e-06, + "loss": 0.3094, + "num_input_tokens_seen": 13124336, + "step": 20110 + }, + { + "epoch": 11.860259433962264, + "grad_norm": 2.6178183555603027, + "learning_rate": 4.252608188222007e-06, + "loss": 0.2977, + "num_input_tokens_seen": 13128240, + "step": 20115 + }, + { + "epoch": 11.86320754716981, + "grad_norm": 4.012906074523926, + "learning_rate": 4.250064477613071e-06, + "loss": 0.3837, + "num_input_tokens_seen": 13130928, + "step": 20120 + }, + { + "epoch": 11.866155660377359, + "grad_norm": 8.452526092529297, + "learning_rate": 4.247520965552797e-06, + "loss": 0.3878, + "num_input_tokens_seen": 13133360, + "step": 20125 + }, + { + "epoch": 11.869103773584905, + "grad_norm": 3.6721739768981934, + "learning_rate": 4.24497765271459e-06, + "loss": 0.3664, + "num_input_tokens_seen": 13135760, + "step": 20130 + }, + { + "epoch": 11.872051886792454, + "grad_norm": 2.9720897674560547, + "learning_rate": 4.242434539771804e-06, + "loss": 0.23, + "num_input_tokens_seen": 13138736, + "step": 20135 + }, + { + "epoch": 11.875, + "grad_norm": 4.060837745666504, + "learning_rate": 4.239891627397737e-06, + "loss": 0.3659, + "num_input_tokens_seen": 13142320, + "step": 20140 + }, + { + "epoch": 11.877948113207546, + "grad_norm": 3.6309335231781006, + "learning_rate": 4.237348916265638e-06, + "loss": 0.314, + "num_input_tokens_seen": 13145424, + "step": 20145 + }, + { + "epoch": 11.880896226415095, + "grad_norm": 5.251545429229736, + "learning_rate": 4.234806407048702e-06, + "loss": 0.3222, + "num_input_tokens_seen": 13148272, + "step": 20150 + }, + { + "epoch": 11.883844339622641, + "grad_norm": 3.9953396320343018, + "learning_rate": 4.232264100420066e-06, + "loss": 0.3203, + "num_input_tokens_seen": 13152944, + "step": 20155 + }, + { + "epoch": 11.88679245283019, + "grad_norm": 2.1548473834991455, + "learning_rate": 4.229721997052819e-06, + "loss": 0.3395, + "num_input_tokens_seen": 13157200, + "step": 20160 + }, + { + "epoch": 11.889740566037736, + "grad_norm": 3.2899868488311768, + "learning_rate": 4.227180097619993e-06, + "loss": 0.2976, + "num_input_tokens_seen": 13160080, + "step": 20165 + }, + { + "epoch": 11.892688679245284, + "grad_norm": 3.432182788848877, + "learning_rate": 4.224638402794566e-06, + "loss": 0.3071, + "num_input_tokens_seen": 13163088, + "step": 20170 + }, + { + "epoch": 11.89563679245283, + "grad_norm": 2.252676010131836, + "learning_rate": 4.222096913249464e-06, + "loss": 0.3437, + "num_input_tokens_seen": 13167440, + "step": 20175 + }, + { + "epoch": 11.898584905660378, + "grad_norm": 6.0276594161987305, + "learning_rate": 4.219555629657558e-06, + "loss": 0.3684, + "num_input_tokens_seen": 13170256, + "step": 20180 + }, + { + "epoch": 11.901533018867925, + "grad_norm": 2.1762845516204834, + "learning_rate": 4.21701455269166e-06, + "loss": 0.1981, + "num_input_tokens_seen": 13175504, + "step": 20185 + }, + { + "epoch": 11.904481132075471, + "grad_norm": 3.138164520263672, + "learning_rate": 4.2144736830245356e-06, + "loss": 0.3874, + "num_input_tokens_seen": 13178768, + "step": 20190 + }, + { + "epoch": 11.90742924528302, + "grad_norm": 2.4305903911590576, + "learning_rate": 4.211933021328889e-06, + "loss": 0.2877, + "num_input_tokens_seen": 13181456, + "step": 20195 + }, + { + "epoch": 11.910377358490566, + "grad_norm": 2.7549068927764893, + "learning_rate": 4.20939256827737e-06, + "loss": 0.432, + "num_input_tokens_seen": 13185008, + "step": 20200 + }, + { + "epoch": 11.913325471698114, + "grad_norm": 3.528595447540283, + "learning_rate": 4.206852324542578e-06, + "loss": 0.3265, + "num_input_tokens_seen": 13188976, + "step": 20205 + }, + { + "epoch": 11.91627358490566, + "grad_norm": 3.036463499069214, + "learning_rate": 4.2043122907970496e-06, + "loss": 0.3963, + "num_input_tokens_seen": 13191856, + "step": 20210 + }, + { + "epoch": 11.919221698113208, + "grad_norm": 2.2963151931762695, + "learning_rate": 4.2017724677132715e-06, + "loss": 0.3803, + "num_input_tokens_seen": 13194800, + "step": 20215 + }, + { + "epoch": 11.922169811320755, + "grad_norm": 2.1888458728790283, + "learning_rate": 4.1992328559636734e-06, + "loss": 0.4519, + "num_input_tokens_seen": 13197328, + "step": 20220 + }, + { + "epoch": 11.925117924528301, + "grad_norm": 2.5733821392059326, + "learning_rate": 4.196693456220628e-06, + "loss": 0.3107, + "num_input_tokens_seen": 13201104, + "step": 20225 + }, + { + "epoch": 11.92806603773585, + "grad_norm": 2.525388479232788, + "learning_rate": 4.194154269156452e-06, + "loss": 0.3754, + "num_input_tokens_seen": 13204624, + "step": 20230 + }, + { + "epoch": 11.931014150943396, + "grad_norm": 3.1193606853485107, + "learning_rate": 4.191615295443404e-06, + "loss": 0.4644, + "num_input_tokens_seen": 13207312, + "step": 20235 + }, + { + "epoch": 11.933962264150944, + "grad_norm": 2.6548690795898438, + "learning_rate": 4.189076535753692e-06, + "loss": 0.2378, + "num_input_tokens_seen": 13211184, + "step": 20240 + }, + { + "epoch": 11.93691037735849, + "grad_norm": 4.048726558685303, + "learning_rate": 4.186537990759464e-06, + "loss": 0.3686, + "num_input_tokens_seen": 13214064, + "step": 20245 + }, + { + "epoch": 11.939858490566039, + "grad_norm": 16.716733932495117, + "learning_rate": 4.183999661132806e-06, + "loss": 0.4844, + "num_input_tokens_seen": 13217136, + "step": 20250 + }, + { + "epoch": 11.942806603773585, + "grad_norm": 4.2716240882873535, + "learning_rate": 4.181461547545756e-06, + "loss": 0.2784, + "num_input_tokens_seen": 13220176, + "step": 20255 + }, + { + "epoch": 11.945754716981131, + "grad_norm": 2.855415105819702, + "learning_rate": 4.178923650670289e-06, + "loss": 0.369, + "num_input_tokens_seen": 13222608, + "step": 20260 + }, + { + "epoch": 11.94870283018868, + "grad_norm": 3.642786741256714, + "learning_rate": 4.176385971178324e-06, + "loss": 0.4154, + "num_input_tokens_seen": 13225904, + "step": 20265 + }, + { + "epoch": 11.951650943396226, + "grad_norm": 2.053516149520874, + "learning_rate": 4.1738485097417225e-06, + "loss": 0.376, + "num_input_tokens_seen": 13229136, + "step": 20270 + }, + { + "epoch": 11.954599056603774, + "grad_norm": 2.9656100273132324, + "learning_rate": 4.1713112670322886e-06, + "loss": 0.3338, + "num_input_tokens_seen": 13231792, + "step": 20275 + }, + { + "epoch": 11.95754716981132, + "grad_norm": 1.9471689462661743, + "learning_rate": 4.168774243721768e-06, + "loss": 0.3771, + "num_input_tokens_seen": 13238160, + "step": 20280 + }, + { + "epoch": 11.960495283018869, + "grad_norm": 6.785888195037842, + "learning_rate": 4.166237440481849e-06, + "loss": 0.3378, + "num_input_tokens_seen": 13242448, + "step": 20285 + }, + { + "epoch": 11.963443396226415, + "grad_norm": 2.9863133430480957, + "learning_rate": 4.163700857984162e-06, + "loss": 0.3214, + "num_input_tokens_seen": 13245648, + "step": 20290 + }, + { + "epoch": 11.966391509433961, + "grad_norm": 2.418766498565674, + "learning_rate": 4.161164496900275e-06, + "loss": 0.5059, + "num_input_tokens_seen": 13248752, + "step": 20295 + }, + { + "epoch": 11.96933962264151, + "grad_norm": 3.253074884414673, + "learning_rate": 4.1586283579017036e-06, + "loss": 0.3935, + "num_input_tokens_seen": 13251536, + "step": 20300 + }, + { + "epoch": 11.972287735849056, + "grad_norm": 2.796755075454712, + "learning_rate": 4.156092441659901e-06, + "loss": 0.2989, + "num_input_tokens_seen": 13254672, + "step": 20305 + }, + { + "epoch": 11.975235849056604, + "grad_norm": 2.7266576290130615, + "learning_rate": 4.153556748846261e-06, + "loss": 0.3406, + "num_input_tokens_seen": 13257680, + "step": 20310 + }, + { + "epoch": 11.97818396226415, + "grad_norm": 5.865846157073975, + "learning_rate": 4.15102128013212e-06, + "loss": 0.2483, + "num_input_tokens_seen": 13260656, + "step": 20315 + }, + { + "epoch": 11.981132075471699, + "grad_norm": 3.521742820739746, + "learning_rate": 4.1484860361887544e-06, + "loss": 0.3695, + "num_input_tokens_seen": 13263888, + "step": 20320 + }, + { + "epoch": 11.984080188679245, + "grad_norm": 4.036994457244873, + "learning_rate": 4.145951017687379e-06, + "loss": 0.4148, + "num_input_tokens_seen": 13266640, + "step": 20325 + }, + { + "epoch": 11.987028301886792, + "grad_norm": 4.236042022705078, + "learning_rate": 4.1434162252991524e-06, + "loss": 0.4365, + "num_input_tokens_seen": 13270096, + "step": 20330 + }, + { + "epoch": 11.98997641509434, + "grad_norm": 3.2309505939483643, + "learning_rate": 4.140881659695173e-06, + "loss": 0.3419, + "num_input_tokens_seen": 13273200, + "step": 20335 + }, + { + "epoch": 11.992924528301886, + "grad_norm": 2.9114160537719727, + "learning_rate": 4.138347321546477e-06, + "loss": 0.4011, + "num_input_tokens_seen": 13276400, + "step": 20340 + }, + { + "epoch": 11.995872641509434, + "grad_norm": 2.664261817932129, + "learning_rate": 4.13581321152404e-06, + "loss": 0.3915, + "num_input_tokens_seen": 13278704, + "step": 20345 + }, + { + "epoch": 11.99882075471698, + "grad_norm": 4.473373889923096, + "learning_rate": 4.133279330298781e-06, + "loss": 0.4077, + "num_input_tokens_seen": 13281168, + "step": 20350 + }, + { + "epoch": 12.0, + "eval_loss": 0.5487748980522156, + "eval_runtime": 19.3819, + "eval_samples_per_second": 87.504, + "eval_steps_per_second": 21.876, + "num_input_tokens_seen": 13281912, + "step": 20352 + }, + { + "epoch": 12.001768867924529, + "grad_norm": 2.6249730587005615, + "learning_rate": 4.130745678541555e-06, + "loss": 0.353, + "num_input_tokens_seen": 13283864, + "step": 20355 + }, + { + "epoch": 12.004716981132075, + "grad_norm": 1.9240474700927734, + "learning_rate": 4.128212256923155e-06, + "loss": 0.3585, + "num_input_tokens_seen": 13288760, + "step": 20360 + }, + { + "epoch": 12.007665094339623, + "grad_norm": 2.769350051879883, + "learning_rate": 4.125679066114318e-06, + "loss": 0.3218, + "num_input_tokens_seen": 13291704, + "step": 20365 + }, + { + "epoch": 12.01061320754717, + "grad_norm": 2.614417552947998, + "learning_rate": 4.123146106785717e-06, + "loss": 0.4386, + "num_input_tokens_seen": 13296344, + "step": 20370 + }, + { + "epoch": 12.013561320754716, + "grad_norm": 5.73816442489624, + "learning_rate": 4.1206133796079625e-06, + "loss": 0.3174, + "num_input_tokens_seen": 13300216, + "step": 20375 + }, + { + "epoch": 12.016509433962264, + "grad_norm": 2.831481456756592, + "learning_rate": 4.1180808852516065e-06, + "loss": 0.3052, + "num_input_tokens_seen": 13303832, + "step": 20380 + }, + { + "epoch": 12.01945754716981, + "grad_norm": 2.5774428844451904, + "learning_rate": 4.115548624387136e-06, + "loss": 0.3133, + "num_input_tokens_seen": 13306424, + "step": 20385 + }, + { + "epoch": 12.022405660377359, + "grad_norm": 3.81384539604187, + "learning_rate": 4.113016597684979e-06, + "loss": 0.3835, + "num_input_tokens_seen": 13309752, + "step": 20390 + }, + { + "epoch": 12.025353773584905, + "grad_norm": 3.5844943523406982, + "learning_rate": 4.110484805815502e-06, + "loss": 0.2298, + "num_input_tokens_seen": 13313528, + "step": 20395 + }, + { + "epoch": 12.028301886792454, + "grad_norm": 4.244783878326416, + "learning_rate": 4.107953249449005e-06, + "loss": 0.3466, + "num_input_tokens_seen": 13317304, + "step": 20400 + }, + { + "epoch": 12.03125, + "grad_norm": 2.51422119140625, + "learning_rate": 4.105421929255729e-06, + "loss": 0.3609, + "num_input_tokens_seen": 13320536, + "step": 20405 + }, + { + "epoch": 12.034198113207546, + "grad_norm": 3.5618269443511963, + "learning_rate": 4.102890845905854e-06, + "loss": 0.382, + "num_input_tokens_seen": 13323896, + "step": 20410 + }, + { + "epoch": 12.037146226415095, + "grad_norm": 3.4842803478240967, + "learning_rate": 4.1003600000694935e-06, + "loss": 0.2472, + "num_input_tokens_seen": 13328152, + "step": 20415 + }, + { + "epoch": 12.040094339622641, + "grad_norm": 2.1029341220855713, + "learning_rate": 4.0978293924167e-06, + "loss": 0.2954, + "num_input_tokens_seen": 13331448, + "step": 20420 + }, + { + "epoch": 12.04304245283019, + "grad_norm": 3.15633225440979, + "learning_rate": 4.095299023617461e-06, + "loss": 0.2799, + "num_input_tokens_seen": 13335736, + "step": 20425 + }, + { + "epoch": 12.045990566037736, + "grad_norm": 1.4970883131027222, + "learning_rate": 4.092768894341707e-06, + "loss": 0.3648, + "num_input_tokens_seen": 13338776, + "step": 20430 + }, + { + "epoch": 12.048938679245284, + "grad_norm": 4.482364654541016, + "learning_rate": 4.090239005259298e-06, + "loss": 0.3692, + "num_input_tokens_seen": 13341656, + "step": 20435 + }, + { + "epoch": 12.05188679245283, + "grad_norm": 3.632020950317383, + "learning_rate": 4.087709357040033e-06, + "loss": 0.4255, + "num_input_tokens_seen": 13344376, + "step": 20440 + }, + { + "epoch": 12.054834905660377, + "grad_norm": 8.917466163635254, + "learning_rate": 4.085179950353648e-06, + "loss": 0.415, + "num_input_tokens_seen": 13349592, + "step": 20445 + }, + { + "epoch": 12.057783018867925, + "grad_norm": 3.203906536102295, + "learning_rate": 4.0826507858698135e-06, + "loss": 0.348, + "num_input_tokens_seen": 13352632, + "step": 20450 + }, + { + "epoch": 12.060731132075471, + "grad_norm": 2.2375380992889404, + "learning_rate": 4.080121864258136e-06, + "loss": 0.4618, + "num_input_tokens_seen": 13356184, + "step": 20455 + }, + { + "epoch": 12.06367924528302, + "grad_norm": 3.8850653171539307, + "learning_rate": 4.077593186188161e-06, + "loss": 0.2893, + "num_input_tokens_seen": 13359448, + "step": 20460 + }, + { + "epoch": 12.066627358490566, + "grad_norm": 3.051764726638794, + "learning_rate": 4.075064752329364e-06, + "loss": 0.4575, + "num_input_tokens_seen": 13362136, + "step": 20465 + }, + { + "epoch": 12.069575471698114, + "grad_norm": 4.132770538330078, + "learning_rate": 4.0725365633511605e-06, + "loss": 0.3701, + "num_input_tokens_seen": 13364952, + "step": 20470 + }, + { + "epoch": 12.07252358490566, + "grad_norm": 3.0670416355133057, + "learning_rate": 4.070008619922899e-06, + "loss": 0.4149, + "num_input_tokens_seen": 13368504, + "step": 20475 + }, + { + "epoch": 12.075471698113208, + "grad_norm": 3.4873197078704834, + "learning_rate": 4.067480922713864e-06, + "loss": 0.4641, + "num_input_tokens_seen": 13371704, + "step": 20480 + }, + { + "epoch": 12.078419811320755, + "grad_norm": 5.753536224365234, + "learning_rate": 4.064953472393273e-06, + "loss": 0.359, + "num_input_tokens_seen": 13374904, + "step": 20485 + }, + { + "epoch": 12.081367924528301, + "grad_norm": 1.3289717435836792, + "learning_rate": 4.06242626963028e-06, + "loss": 0.3922, + "num_input_tokens_seen": 13378296, + "step": 20490 + }, + { + "epoch": 12.08431603773585, + "grad_norm": 2.2111828327178955, + "learning_rate": 4.059899315093972e-06, + "loss": 0.2894, + "num_input_tokens_seen": 13380696, + "step": 20495 + }, + { + "epoch": 12.087264150943396, + "grad_norm": 4.29170036315918, + "learning_rate": 4.057372609453374e-06, + "loss": 0.2079, + "num_input_tokens_seen": 13383512, + "step": 20500 + }, + { + "epoch": 12.090212264150944, + "grad_norm": 4.303058624267578, + "learning_rate": 4.054846153377439e-06, + "loss": 0.2893, + "num_input_tokens_seen": 13386072, + "step": 20505 + }, + { + "epoch": 12.09316037735849, + "grad_norm": 2.777421712875366, + "learning_rate": 4.052319947535058e-06, + "loss": 0.328, + "num_input_tokens_seen": 13389496, + "step": 20510 + }, + { + "epoch": 12.096108490566039, + "grad_norm": 2.4721410274505615, + "learning_rate": 4.049793992595056e-06, + "loss": 0.2193, + "num_input_tokens_seen": 13392824, + "step": 20515 + }, + { + "epoch": 12.099056603773585, + "grad_norm": 6.346024036407471, + "learning_rate": 4.047268289226187e-06, + "loss": 0.434, + "num_input_tokens_seen": 13396248, + "step": 20520 + }, + { + "epoch": 12.102004716981131, + "grad_norm": 3.03840970993042, + "learning_rate": 4.044742838097147e-06, + "loss": 0.3271, + "num_input_tokens_seen": 13400696, + "step": 20525 + }, + { + "epoch": 12.10495283018868, + "grad_norm": 3.6480863094329834, + "learning_rate": 4.0422176398765564e-06, + "loss": 0.3603, + "num_input_tokens_seen": 13403416, + "step": 20530 + }, + { + "epoch": 12.107900943396226, + "grad_norm": 2.697359561920166, + "learning_rate": 4.039692695232975e-06, + "loss": 0.3258, + "num_input_tokens_seen": 13406232, + "step": 20535 + }, + { + "epoch": 12.110849056603774, + "grad_norm": 4.425070285797119, + "learning_rate": 4.03716800483489e-06, + "loss": 0.4166, + "num_input_tokens_seen": 13409592, + "step": 20540 + }, + { + "epoch": 12.11379716981132, + "grad_norm": 5.141991138458252, + "learning_rate": 4.034643569350726e-06, + "loss": 0.4068, + "num_input_tokens_seen": 13412600, + "step": 20545 + }, + { + "epoch": 12.116745283018869, + "grad_norm": 1.7947977781295776, + "learning_rate": 4.032119389448837e-06, + "loss": 0.3674, + "num_input_tokens_seen": 13415704, + "step": 20550 + }, + { + "epoch": 12.119693396226415, + "grad_norm": 3.791771173477173, + "learning_rate": 4.0295954657975115e-06, + "loss": 0.3078, + "num_input_tokens_seen": 13418488, + "step": 20555 + }, + { + "epoch": 12.122641509433961, + "grad_norm": 3.4496617317199707, + "learning_rate": 4.027071799064968e-06, + "loss": 0.4106, + "num_input_tokens_seen": 13422328, + "step": 20560 + }, + { + "epoch": 12.12558962264151, + "grad_norm": 3.7593331336975098, + "learning_rate": 4.02454838991936e-06, + "loss": 0.4722, + "num_input_tokens_seen": 13425624, + "step": 20565 + }, + { + "epoch": 12.128537735849056, + "grad_norm": 4.181211471557617, + "learning_rate": 4.022025239028768e-06, + "loss": 0.4135, + "num_input_tokens_seen": 13428760, + "step": 20570 + }, + { + "epoch": 12.131485849056604, + "grad_norm": 6.438934326171875, + "learning_rate": 4.0195023470612095e-06, + "loss": 0.3318, + "num_input_tokens_seen": 13431224, + "step": 20575 + }, + { + "epoch": 12.13443396226415, + "grad_norm": 5.056939601898193, + "learning_rate": 4.016979714684631e-06, + "loss": 0.3496, + "num_input_tokens_seen": 13433464, + "step": 20580 + }, + { + "epoch": 12.137382075471699, + "grad_norm": 4.1657819747924805, + "learning_rate": 4.014457342566909e-06, + "loss": 0.3823, + "num_input_tokens_seen": 13436280, + "step": 20585 + }, + { + "epoch": 12.140330188679245, + "grad_norm": 3.3484060764312744, + "learning_rate": 4.011935231375853e-06, + "loss": 0.2917, + "num_input_tokens_seen": 13438808, + "step": 20590 + }, + { + "epoch": 12.143278301886792, + "grad_norm": 3.923607349395752, + "learning_rate": 4.009413381779203e-06, + "loss": 0.4555, + "num_input_tokens_seen": 13441592, + "step": 20595 + }, + { + "epoch": 12.14622641509434, + "grad_norm": 2.997964859008789, + "learning_rate": 4.00689179444463e-06, + "loss": 0.3184, + "num_input_tokens_seen": 13444728, + "step": 20600 + }, + { + "epoch": 12.149174528301886, + "grad_norm": 2.206421136856079, + "learning_rate": 4.004370470039733e-06, + "loss": 0.335, + "num_input_tokens_seen": 13448280, + "step": 20605 + }, + { + "epoch": 12.152122641509434, + "grad_norm": 2.3633475303649902, + "learning_rate": 4.001849409232046e-06, + "loss": 0.3919, + "num_input_tokens_seen": 13451256, + "step": 20610 + }, + { + "epoch": 12.15507075471698, + "grad_norm": 2.888762950897217, + "learning_rate": 3.9993286126890274e-06, + "loss": 0.2797, + "num_input_tokens_seen": 13454232, + "step": 20615 + }, + { + "epoch": 12.158018867924529, + "grad_norm": 2.391286849975586, + "learning_rate": 3.996808081078074e-06, + "loss": 0.3011, + "num_input_tokens_seen": 13457464, + "step": 20620 + }, + { + "epoch": 12.160966981132075, + "grad_norm": 3.058185338973999, + "learning_rate": 3.9942878150665035e-06, + "loss": 0.2561, + "num_input_tokens_seen": 13460728, + "step": 20625 + }, + { + "epoch": 12.163915094339623, + "grad_norm": 2.7282187938690186, + "learning_rate": 3.991767815321569e-06, + "loss": 0.3655, + "num_input_tokens_seen": 13464280, + "step": 20630 + }, + { + "epoch": 12.16686320754717, + "grad_norm": 2.987643003463745, + "learning_rate": 3.9892480825104504e-06, + "loss": 0.4409, + "num_input_tokens_seen": 13467640, + "step": 20635 + }, + { + "epoch": 12.169811320754716, + "grad_norm": 5.0638604164123535, + "learning_rate": 3.986728617300257e-06, + "loss": 0.3553, + "num_input_tokens_seen": 13470424, + "step": 20640 + }, + { + "epoch": 12.172759433962264, + "grad_norm": 2.9158055782318115, + "learning_rate": 3.984209420358031e-06, + "loss": 0.3627, + "num_input_tokens_seen": 13473240, + "step": 20645 + }, + { + "epoch": 12.17570754716981, + "grad_norm": 3.2295644283294678, + "learning_rate": 3.981690492350738e-06, + "loss": 0.4265, + "num_input_tokens_seen": 13476984, + "step": 20650 + }, + { + "epoch": 12.178655660377359, + "grad_norm": 3.7602219581604004, + "learning_rate": 3.979171833945276e-06, + "loss": 0.3657, + "num_input_tokens_seen": 13480632, + "step": 20655 + }, + { + "epoch": 12.181603773584905, + "grad_norm": 3.4003725051879883, + "learning_rate": 3.97665344580847e-06, + "loss": 0.4206, + "num_input_tokens_seen": 13484696, + "step": 20660 + }, + { + "epoch": 12.184551886792454, + "grad_norm": 2.5987823009490967, + "learning_rate": 3.974135328607075e-06, + "loss": 0.2996, + "num_input_tokens_seen": 13488056, + "step": 20665 + }, + { + "epoch": 12.1875, + "grad_norm": 3.8917205333709717, + "learning_rate": 3.971617483007773e-06, + "loss": 0.3679, + "num_input_tokens_seen": 13491576, + "step": 20670 + }, + { + "epoch": 12.190448113207546, + "grad_norm": 4.310510158538818, + "learning_rate": 3.969099909677174e-06, + "loss": 0.3933, + "num_input_tokens_seen": 13493816, + "step": 20675 + }, + { + "epoch": 12.193396226415095, + "grad_norm": 4.217745780944824, + "learning_rate": 3.9665826092818165e-06, + "loss": 0.2716, + "num_input_tokens_seen": 13496600, + "step": 20680 + }, + { + "epoch": 12.196344339622641, + "grad_norm": 2.611581802368164, + "learning_rate": 3.964065582488168e-06, + "loss": 0.3145, + "num_input_tokens_seen": 13499608, + "step": 20685 + }, + { + "epoch": 12.19929245283019, + "grad_norm": 4.735732078552246, + "learning_rate": 3.961548829962622e-06, + "loss": 0.3863, + "num_input_tokens_seen": 13502712, + "step": 20690 + }, + { + "epoch": 12.202240566037736, + "grad_norm": 3.4904375076293945, + "learning_rate": 3.959032352371496e-06, + "loss": 0.3898, + "num_input_tokens_seen": 13505848, + "step": 20695 + }, + { + "epoch": 12.205188679245284, + "grad_norm": 2.2928225994110107, + "learning_rate": 3.956516150381043e-06, + "loss": 0.3046, + "num_input_tokens_seen": 13508344, + "step": 20700 + }, + { + "epoch": 12.20813679245283, + "grad_norm": 4.314245223999023, + "learning_rate": 3.954000224657436e-06, + "loss": 0.3733, + "num_input_tokens_seen": 13510840, + "step": 20705 + }, + { + "epoch": 12.211084905660377, + "grad_norm": 2.632591962814331, + "learning_rate": 3.951484575866776e-06, + "loss": 0.2855, + "num_input_tokens_seen": 13514072, + "step": 20710 + }, + { + "epoch": 12.214033018867925, + "grad_norm": 2.3361353874206543, + "learning_rate": 3.948969204675096e-06, + "loss": 0.3604, + "num_input_tokens_seen": 13516856, + "step": 20715 + }, + { + "epoch": 12.216981132075471, + "grad_norm": 2.4873592853546143, + "learning_rate": 3.946454111748346e-06, + "loss": 0.3176, + "num_input_tokens_seen": 13521144, + "step": 20720 + }, + { + "epoch": 12.21992924528302, + "grad_norm": 3.214561700820923, + "learning_rate": 3.943939297752413e-06, + "loss": 0.3252, + "num_input_tokens_seen": 13525336, + "step": 20725 + }, + { + "epoch": 12.222877358490566, + "grad_norm": 6.3854169845581055, + "learning_rate": 3.9414247633531e-06, + "loss": 0.4091, + "num_input_tokens_seen": 13528312, + "step": 20730 + }, + { + "epoch": 12.225825471698114, + "grad_norm": 4.805942535400391, + "learning_rate": 3.9389105092161454e-06, + "loss": 0.3953, + "num_input_tokens_seen": 13530616, + "step": 20735 + }, + { + "epoch": 12.22877358490566, + "grad_norm": 2.2574925422668457, + "learning_rate": 3.936396536007205e-06, + "loss": 0.3276, + "num_input_tokens_seen": 13534328, + "step": 20740 + }, + { + "epoch": 12.231721698113208, + "grad_norm": 2.8402884006500244, + "learning_rate": 3.933882844391866e-06, + "loss": 0.3409, + "num_input_tokens_seen": 13537336, + "step": 20745 + }, + { + "epoch": 12.234669811320755, + "grad_norm": 2.713965892791748, + "learning_rate": 3.931369435035639e-06, + "loss": 0.294, + "num_input_tokens_seen": 13539800, + "step": 20750 + }, + { + "epoch": 12.237617924528301, + "grad_norm": 4.26702356338501, + "learning_rate": 3.92885630860396e-06, + "loss": 0.3727, + "num_input_tokens_seen": 13542680, + "step": 20755 + }, + { + "epoch": 12.24056603773585, + "grad_norm": 4.106425762176514, + "learning_rate": 3.926343465762189e-06, + "loss": 0.392, + "num_input_tokens_seen": 13545080, + "step": 20760 + }, + { + "epoch": 12.243514150943396, + "grad_norm": 1.8896358013153076, + "learning_rate": 3.923830907175613e-06, + "loss": 0.3523, + "num_input_tokens_seen": 13548984, + "step": 20765 + }, + { + "epoch": 12.246462264150944, + "grad_norm": 2.64504337310791, + "learning_rate": 3.921318633509442e-06, + "loss": 0.2164, + "num_input_tokens_seen": 13552312, + "step": 20770 + }, + { + "epoch": 12.24941037735849, + "grad_norm": 2.726611614227295, + "learning_rate": 3.918806645428811e-06, + "loss": 0.3877, + "num_input_tokens_seen": 13556600, + "step": 20775 + }, + { + "epoch": 12.252358490566039, + "grad_norm": 3.683722972869873, + "learning_rate": 3.916294943598781e-06, + "loss": 0.4143, + "num_input_tokens_seen": 13559448, + "step": 20780 + }, + { + "epoch": 12.255306603773585, + "grad_norm": 2.6430466175079346, + "learning_rate": 3.913783528684336e-06, + "loss": 0.4822, + "num_input_tokens_seen": 13562936, + "step": 20785 + }, + { + "epoch": 12.258254716981131, + "grad_norm": 3.876765489578247, + "learning_rate": 3.911272401350381e-06, + "loss": 0.3081, + "num_input_tokens_seen": 13566328, + "step": 20790 + }, + { + "epoch": 12.26120283018868, + "grad_norm": 4.667914867401123, + "learning_rate": 3.9087615622617525e-06, + "loss": 0.3079, + "num_input_tokens_seen": 13569592, + "step": 20795 + }, + { + "epoch": 12.264150943396226, + "grad_norm": 3.3990318775177, + "learning_rate": 3.906251012083202e-06, + "loss": 0.3767, + "num_input_tokens_seen": 13572760, + "step": 20800 + }, + { + "epoch": 12.267099056603774, + "grad_norm": 4.712822437286377, + "learning_rate": 3.9037407514794085e-06, + "loss": 0.353, + "num_input_tokens_seen": 13576184, + "step": 20805 + }, + { + "epoch": 12.27004716981132, + "grad_norm": 3.084927797317505, + "learning_rate": 3.901230781114976e-06, + "loss": 0.4948, + "num_input_tokens_seen": 13579512, + "step": 20810 + }, + { + "epoch": 12.272995283018869, + "grad_norm": 4.119962215423584, + "learning_rate": 3.898721101654431e-06, + "loss": 0.3364, + "num_input_tokens_seen": 13582680, + "step": 20815 + }, + { + "epoch": 12.275943396226415, + "grad_norm": 3.716001272201538, + "learning_rate": 3.896211713762221e-06, + "loss": 0.3751, + "num_input_tokens_seen": 13585976, + "step": 20820 + }, + { + "epoch": 12.278891509433961, + "grad_norm": 4.973601818084717, + "learning_rate": 3.893702618102715e-06, + "loss": 0.4929, + "num_input_tokens_seen": 13588728, + "step": 20825 + }, + { + "epoch": 12.28183962264151, + "grad_norm": 2.5675604343414307, + "learning_rate": 3.891193815340211e-06, + "loss": 0.3092, + "num_input_tokens_seen": 13591512, + "step": 20830 + }, + { + "epoch": 12.284787735849056, + "grad_norm": 2.8471450805664062, + "learning_rate": 3.888685306138922e-06, + "loss": 0.4348, + "num_input_tokens_seen": 13595352, + "step": 20835 + }, + { + "epoch": 12.287735849056604, + "grad_norm": 4.959554195404053, + "learning_rate": 3.886177091162987e-06, + "loss": 0.3387, + "num_input_tokens_seen": 13599064, + "step": 20840 + }, + { + "epoch": 12.29068396226415, + "grad_norm": 1.6358941793441772, + "learning_rate": 3.883669171076468e-06, + "loss": 0.2732, + "num_input_tokens_seen": 13602296, + "step": 20845 + }, + { + "epoch": 12.293632075471699, + "grad_norm": 2.0686709880828857, + "learning_rate": 3.881161546543348e-06, + "loss": 0.3853, + "num_input_tokens_seen": 13605720, + "step": 20850 + }, + { + "epoch": 12.296580188679245, + "grad_norm": 9.706639289855957, + "learning_rate": 3.8786542182275295e-06, + "loss": 0.4167, + "num_input_tokens_seen": 13608664, + "step": 20855 + }, + { + "epoch": 12.299528301886792, + "grad_norm": 2.397244930267334, + "learning_rate": 3.87614718679284e-06, + "loss": 0.409, + "num_input_tokens_seen": 13611992, + "step": 20860 + }, + { + "epoch": 12.30247641509434, + "grad_norm": 2.125706672668457, + "learning_rate": 3.8736404529030255e-06, + "loss": 0.4036, + "num_input_tokens_seen": 13616184, + "step": 20865 + }, + { + "epoch": 12.305424528301886, + "grad_norm": 3.891131639480591, + "learning_rate": 3.871134017221756e-06, + "loss": 0.3698, + "num_input_tokens_seen": 13619096, + "step": 20870 + }, + { + "epoch": 12.308372641509434, + "grad_norm": 2.491774559020996, + "learning_rate": 3.868627880412621e-06, + "loss": 0.481, + "num_input_tokens_seen": 13622008, + "step": 20875 + }, + { + "epoch": 12.31132075471698, + "grad_norm": 2.9624111652374268, + "learning_rate": 3.86612204313913e-06, + "loss": 0.277, + "num_input_tokens_seen": 13625976, + "step": 20880 + }, + { + "epoch": 12.314268867924529, + "grad_norm": 2.9760162830352783, + "learning_rate": 3.863616506064714e-06, + "loss": 0.3385, + "num_input_tokens_seen": 13629272, + "step": 20885 + }, + { + "epoch": 12.317216981132075, + "grad_norm": 3.2171952724456787, + "learning_rate": 3.861111269852727e-06, + "loss": 0.2594, + "num_input_tokens_seen": 13632376, + "step": 20890 + }, + { + "epoch": 12.320165094339623, + "grad_norm": 2.1058924198150635, + "learning_rate": 3.858606335166439e-06, + "loss": 0.337, + "num_input_tokens_seen": 13635352, + "step": 20895 + }, + { + "epoch": 12.32311320754717, + "grad_norm": 2.2584102153778076, + "learning_rate": 3.8561017026690415e-06, + "loss": 0.3032, + "num_input_tokens_seen": 13638456, + "step": 20900 + }, + { + "epoch": 12.326061320754716, + "grad_norm": 3.1981778144836426, + "learning_rate": 3.8535973730236495e-06, + "loss": 0.3073, + "num_input_tokens_seen": 13640856, + "step": 20905 + }, + { + "epoch": 12.329009433962264, + "grad_norm": 1.7064399719238281, + "learning_rate": 3.851093346893294e-06, + "loss": 0.3552, + "num_input_tokens_seen": 13644152, + "step": 20910 + }, + { + "epoch": 12.33195754716981, + "grad_norm": 3.0838658809661865, + "learning_rate": 3.848589624940927e-06, + "loss": 0.3963, + "num_input_tokens_seen": 13647320, + "step": 20915 + }, + { + "epoch": 12.334905660377359, + "grad_norm": 4.527851581573486, + "learning_rate": 3.846086207829417e-06, + "loss": 0.3738, + "num_input_tokens_seen": 13650328, + "step": 20920 + }, + { + "epoch": 12.337853773584905, + "grad_norm": 3.5663979053497314, + "learning_rate": 3.843583096221559e-06, + "loss": 0.4254, + "num_input_tokens_seen": 13653688, + "step": 20925 + }, + { + "epoch": 12.340801886792454, + "grad_norm": 3.951063632965088, + "learning_rate": 3.8410802907800596e-06, + "loss": 0.4677, + "num_input_tokens_seen": 13657048, + "step": 20930 + }, + { + "epoch": 12.34375, + "grad_norm": 3.8893046379089355, + "learning_rate": 3.838577792167546e-06, + "loss": 0.4233, + "num_input_tokens_seen": 13660920, + "step": 20935 + }, + { + "epoch": 12.346698113207546, + "grad_norm": 6.41038179397583, + "learning_rate": 3.836075601046569e-06, + "loss": 0.395, + "num_input_tokens_seen": 13663896, + "step": 20940 + }, + { + "epoch": 12.349646226415095, + "grad_norm": 2.8852298259735107, + "learning_rate": 3.833573718079594e-06, + "loss": 0.4834, + "num_input_tokens_seen": 13667192, + "step": 20945 + }, + { + "epoch": 12.352594339622641, + "grad_norm": 3.619802951812744, + "learning_rate": 3.831072143929002e-06, + "loss": 0.3511, + "num_input_tokens_seen": 13669976, + "step": 20950 + }, + { + "epoch": 12.35554245283019, + "grad_norm": 2.214292049407959, + "learning_rate": 3.828570879257098e-06, + "loss": 0.4502, + "num_input_tokens_seen": 13672728, + "step": 20955 + }, + { + "epoch": 12.358490566037736, + "grad_norm": 3.0870726108551025, + "learning_rate": 3.826069924726102e-06, + "loss": 0.3107, + "num_input_tokens_seen": 13676056, + "step": 20960 + }, + { + "epoch": 12.361438679245284, + "grad_norm": 4.07956600189209, + "learning_rate": 3.823569280998154e-06, + "loss": 0.4432, + "num_input_tokens_seen": 13679032, + "step": 20965 + }, + { + "epoch": 12.36438679245283, + "grad_norm": 4.918158531188965, + "learning_rate": 3.8210689487353095e-06, + "loss": 0.3114, + "num_input_tokens_seen": 13682040, + "step": 20970 + }, + { + "epoch": 12.367334905660377, + "grad_norm": 2.5436489582061768, + "learning_rate": 3.818568928599539e-06, + "loss": 0.371, + "num_input_tokens_seen": 13684984, + "step": 20975 + }, + { + "epoch": 12.370283018867925, + "grad_norm": 3.9167094230651855, + "learning_rate": 3.816069221252739e-06, + "loss": 0.2964, + "num_input_tokens_seen": 13687832, + "step": 20980 + }, + { + "epoch": 12.373231132075471, + "grad_norm": 3.0221219062805176, + "learning_rate": 3.813569827356715e-06, + "loss": 0.2916, + "num_input_tokens_seen": 13692472, + "step": 20985 + }, + { + "epoch": 12.37617924528302, + "grad_norm": 3.067626953125, + "learning_rate": 3.811070747573191e-06, + "loss": 0.2198, + "num_input_tokens_seen": 13698392, + "step": 20990 + }, + { + "epoch": 12.379127358490566, + "grad_norm": 2.4305059909820557, + "learning_rate": 3.8085719825638098e-06, + "loss": 0.302, + "num_input_tokens_seen": 13702424, + "step": 20995 + }, + { + "epoch": 12.382075471698114, + "grad_norm": 2.4149930477142334, + "learning_rate": 3.806073532990132e-06, + "loss": 0.3146, + "num_input_tokens_seen": 13705240, + "step": 21000 + }, + { + "epoch": 12.38502358490566, + "grad_norm": 2.159437894821167, + "learning_rate": 3.8035753995136333e-06, + "loss": 0.3507, + "num_input_tokens_seen": 13708664, + "step": 21005 + }, + { + "epoch": 12.387971698113208, + "grad_norm": 3.321687936782837, + "learning_rate": 3.801077582795702e-06, + "loss": 0.2927, + "num_input_tokens_seen": 13711992, + "step": 21010 + }, + { + "epoch": 12.390919811320755, + "grad_norm": 2.6007816791534424, + "learning_rate": 3.7985800834976494e-06, + "loss": 0.2707, + "num_input_tokens_seen": 13714648, + "step": 21015 + }, + { + "epoch": 12.393867924528301, + "grad_norm": 2.081460952758789, + "learning_rate": 3.7960829022806965e-06, + "loss": 0.2912, + "num_input_tokens_seen": 13717880, + "step": 21020 + }, + { + "epoch": 12.39681603773585, + "grad_norm": 5.388912200927734, + "learning_rate": 3.793586039805984e-06, + "loss": 0.3051, + "num_input_tokens_seen": 13720600, + "step": 21025 + }, + { + "epoch": 12.399764150943396, + "grad_norm": 2.776482105255127, + "learning_rate": 3.791089496734567e-06, + "loss": 0.3643, + "num_input_tokens_seen": 13723800, + "step": 21030 + }, + { + "epoch": 12.402712264150944, + "grad_norm": 4.252889156341553, + "learning_rate": 3.7885932737274163e-06, + "loss": 0.3376, + "num_input_tokens_seen": 13727032, + "step": 21035 + }, + { + "epoch": 12.40566037735849, + "grad_norm": 5.593424320220947, + "learning_rate": 3.7860973714454156e-06, + "loss": 0.3379, + "num_input_tokens_seen": 13730040, + "step": 21040 + }, + { + "epoch": 12.408608490566039, + "grad_norm": 2.809781074523926, + "learning_rate": 3.7836017905493695e-06, + "loss": 0.3424, + "num_input_tokens_seen": 13733560, + "step": 21045 + }, + { + "epoch": 12.411556603773585, + "grad_norm": 2.951747179031372, + "learning_rate": 3.7811065316999908e-06, + "loss": 0.3554, + "num_input_tokens_seen": 13737080, + "step": 21050 + }, + { + "epoch": 12.414504716981131, + "grad_norm": 1.792353630065918, + "learning_rate": 3.7786115955579105e-06, + "loss": 0.4199, + "num_input_tokens_seen": 13741432, + "step": 21055 + }, + { + "epoch": 12.41745283018868, + "grad_norm": 1.9442509412765503, + "learning_rate": 3.7761169827836746e-06, + "loss": 0.3953, + "num_input_tokens_seen": 13749272, + "step": 21060 + }, + { + "epoch": 12.420400943396226, + "grad_norm": 3.4836626052856445, + "learning_rate": 3.773622694037743e-06, + "loss": 0.3446, + "num_input_tokens_seen": 13752312, + "step": 21065 + }, + { + "epoch": 12.423349056603774, + "grad_norm": 2.4343535900115967, + "learning_rate": 3.7711287299804865e-06, + "loss": 0.4225, + "num_input_tokens_seen": 13756088, + "step": 21070 + }, + { + "epoch": 12.42629716981132, + "grad_norm": 3.199510335922241, + "learning_rate": 3.768635091272197e-06, + "loss": 0.3629, + "num_input_tokens_seen": 13759192, + "step": 21075 + }, + { + "epoch": 12.429245283018869, + "grad_norm": 2.261979579925537, + "learning_rate": 3.7661417785730732e-06, + "loss": 0.3697, + "num_input_tokens_seen": 13762392, + "step": 21080 + }, + { + "epoch": 12.432193396226415, + "grad_norm": 4.748734474182129, + "learning_rate": 3.7636487925432304e-06, + "loss": 0.4001, + "num_input_tokens_seen": 13765560, + "step": 21085 + }, + { + "epoch": 12.435141509433961, + "grad_norm": 2.526278018951416, + "learning_rate": 3.761156133842697e-06, + "loss": 0.3054, + "num_input_tokens_seen": 13768792, + "step": 21090 + }, + { + "epoch": 12.43808962264151, + "grad_norm": 2.734201669692993, + "learning_rate": 3.7586638031314182e-06, + "loss": 0.2721, + "num_input_tokens_seen": 13771544, + "step": 21095 + }, + { + "epoch": 12.441037735849056, + "grad_norm": 2.3232436180114746, + "learning_rate": 3.7561718010692477e-06, + "loss": 0.2734, + "num_input_tokens_seen": 13774264, + "step": 21100 + }, + { + "epoch": 12.443985849056604, + "grad_norm": 4.892324447631836, + "learning_rate": 3.7536801283159523e-06, + "loss": 0.45, + "num_input_tokens_seen": 13777176, + "step": 21105 + }, + { + "epoch": 12.44693396226415, + "grad_norm": 4.02307653427124, + "learning_rate": 3.7511887855312155e-06, + "loss": 0.3517, + "num_input_tokens_seen": 13779640, + "step": 21110 + }, + { + "epoch": 12.449882075471699, + "grad_norm": 3.155975341796875, + "learning_rate": 3.74869777337463e-06, + "loss": 0.2688, + "num_input_tokens_seen": 13782872, + "step": 21115 + }, + { + "epoch": 12.452830188679245, + "grad_norm": 3.7623212337493896, + "learning_rate": 3.7462070925057004e-06, + "loss": 0.3517, + "num_input_tokens_seen": 13787512, + "step": 21120 + }, + { + "epoch": 12.455778301886792, + "grad_norm": 2.7868552207946777, + "learning_rate": 3.7437167435838472e-06, + "loss": 0.2963, + "num_input_tokens_seen": 13790584, + "step": 21125 + }, + { + "epoch": 12.45872641509434, + "grad_norm": 3.0571517944335938, + "learning_rate": 3.741226727268401e-06, + "loss": 0.4306, + "num_input_tokens_seen": 13793976, + "step": 21130 + }, + { + "epoch": 12.461674528301886, + "grad_norm": 5.736512660980225, + "learning_rate": 3.7387370442186027e-06, + "loss": 0.3287, + "num_input_tokens_seen": 13797528, + "step": 21135 + }, + { + "epoch": 12.464622641509434, + "grad_norm": 3.231822967529297, + "learning_rate": 3.736247695093609e-06, + "loss": 0.3129, + "num_input_tokens_seen": 13800696, + "step": 21140 + }, + { + "epoch": 12.46757075471698, + "grad_norm": 2.9698550701141357, + "learning_rate": 3.7337586805524838e-06, + "loss": 0.3624, + "num_input_tokens_seen": 13804312, + "step": 21145 + }, + { + "epoch": 12.470518867924529, + "grad_norm": 4.9800801277160645, + "learning_rate": 3.731270001254205e-06, + "loss": 0.3027, + "num_input_tokens_seen": 13807096, + "step": 21150 + }, + { + "epoch": 12.473466981132075, + "grad_norm": 2.5995781421661377, + "learning_rate": 3.728781657857661e-06, + "loss": 0.3137, + "num_input_tokens_seen": 13810744, + "step": 21155 + }, + { + "epoch": 12.476415094339623, + "grad_norm": 4.284379482269287, + "learning_rate": 3.726293651021653e-06, + "loss": 0.3676, + "num_input_tokens_seen": 13814296, + "step": 21160 + }, + { + "epoch": 12.47936320754717, + "grad_norm": 5.0136895179748535, + "learning_rate": 3.7238059814048888e-06, + "loss": 0.2998, + "num_input_tokens_seen": 13817112, + "step": 21165 + }, + { + "epoch": 12.482311320754716, + "grad_norm": 6.4987006187438965, + "learning_rate": 3.7213186496659916e-06, + "loss": 0.4281, + "num_input_tokens_seen": 13819832, + "step": 21170 + }, + { + "epoch": 12.485259433962264, + "grad_norm": 13.371336936950684, + "learning_rate": 3.718831656463493e-06, + "loss": 0.4268, + "num_input_tokens_seen": 13822712, + "step": 21175 + }, + { + "epoch": 12.48820754716981, + "grad_norm": 3.0891411304473877, + "learning_rate": 3.7163450024558345e-06, + "loss": 0.3813, + "num_input_tokens_seen": 13825624, + "step": 21180 + }, + { + "epoch": 12.491155660377359, + "grad_norm": 2.3158745765686035, + "learning_rate": 3.7138586883013694e-06, + "loss": 0.2945, + "num_input_tokens_seen": 13828440, + "step": 21185 + }, + { + "epoch": 12.494103773584905, + "grad_norm": 3.035343885421753, + "learning_rate": 3.7113727146583573e-06, + "loss": 0.325, + "num_input_tokens_seen": 13831544, + "step": 21190 + }, + { + "epoch": 12.497051886792454, + "grad_norm": 2.359293222427368, + "learning_rate": 3.7088870821849748e-06, + "loss": 0.4517, + "num_input_tokens_seen": 13834520, + "step": 21195 + }, + { + "epoch": 12.5, + "grad_norm": 3.258863925933838, + "learning_rate": 3.706401791539301e-06, + "loss": 0.4052, + "num_input_tokens_seen": 13838040, + "step": 21200 + }, + { + "epoch": 12.502948113207546, + "grad_norm": 4.702422142028809, + "learning_rate": 3.703916843379328e-06, + "loss": 0.3706, + "num_input_tokens_seen": 13841688, + "step": 21205 + }, + { + "epoch": 12.505896226415095, + "grad_norm": 4.060920238494873, + "learning_rate": 3.7014322383629575e-06, + "loss": 0.265, + "num_input_tokens_seen": 13845144, + "step": 21210 + }, + { + "epoch": 12.508844339622641, + "grad_norm": 3.204749822616577, + "learning_rate": 3.6989479771479976e-06, + "loss": 0.4326, + "num_input_tokens_seen": 13847928, + "step": 21215 + }, + { + "epoch": 12.51179245283019, + "grad_norm": 3.049386739730835, + "learning_rate": 3.696464060392169e-06, + "loss": 0.3763, + "num_input_tokens_seen": 13851320, + "step": 21220 + }, + { + "epoch": 12.514740566037736, + "grad_norm": 2.6858530044555664, + "learning_rate": 3.6939804887530962e-06, + "loss": 0.2907, + "num_input_tokens_seen": 13854520, + "step": 21225 + }, + { + "epoch": 12.517688679245284, + "grad_norm": 2.4298501014709473, + "learning_rate": 3.6914972628883196e-06, + "loss": 0.4019, + "num_input_tokens_seen": 13858648, + "step": 21230 + }, + { + "epoch": 12.52063679245283, + "grad_norm": 6.548820495605469, + "learning_rate": 3.6890143834552814e-06, + "loss": 0.347, + "num_input_tokens_seen": 13861880, + "step": 21235 + }, + { + "epoch": 12.523584905660378, + "grad_norm": 4.843287467956543, + "learning_rate": 3.6865318511113347e-06, + "loss": 0.3717, + "num_input_tokens_seen": 13864376, + "step": 21240 + }, + { + "epoch": 12.526533018867925, + "grad_norm": 2.7702043056488037, + "learning_rate": 3.684049666513742e-06, + "loss": 0.2905, + "num_input_tokens_seen": 13867288, + "step": 21245 + }, + { + "epoch": 12.529481132075471, + "grad_norm": 3.4120352268218994, + "learning_rate": 3.6815678303196715e-06, + "loss": 0.3394, + "num_input_tokens_seen": 13870264, + "step": 21250 + }, + { + "epoch": 12.53242924528302, + "grad_norm": 1.9582855701446533, + "learning_rate": 3.6790863431861988e-06, + "loss": 0.278, + "num_input_tokens_seen": 13874360, + "step": 21255 + }, + { + "epoch": 12.535377358490566, + "grad_norm": 3.2264082431793213, + "learning_rate": 3.676605205770311e-06, + "loss": 0.4347, + "num_input_tokens_seen": 13876664, + "step": 21260 + }, + { + "epoch": 12.538325471698114, + "grad_norm": 2.8540754318237305, + "learning_rate": 3.674124418728898e-06, + "loss": 0.3338, + "num_input_tokens_seen": 13880088, + "step": 21265 + }, + { + "epoch": 12.54127358490566, + "grad_norm": 5.460129737854004, + "learning_rate": 3.671643982718759e-06, + "loss": 0.3735, + "num_input_tokens_seen": 13883160, + "step": 21270 + }, + { + "epoch": 12.544221698113208, + "grad_norm": 2.6225204467773438, + "learning_rate": 3.6691638983966017e-06, + "loss": 0.3897, + "num_input_tokens_seen": 13886648, + "step": 21275 + }, + { + "epoch": 12.547169811320755, + "grad_norm": 3.329176664352417, + "learning_rate": 3.666684166419037e-06, + "loss": 0.4713, + "num_input_tokens_seen": 13889656, + "step": 21280 + }, + { + "epoch": 12.550117924528301, + "grad_norm": 2.0760645866394043, + "learning_rate": 3.6642047874425833e-06, + "loss": 0.2621, + "num_input_tokens_seen": 13893144, + "step": 21285 + }, + { + "epoch": 12.55306603773585, + "grad_norm": 3.7049195766448975, + "learning_rate": 3.661725762123671e-06, + "loss": 0.3365, + "num_input_tokens_seen": 13896056, + "step": 21290 + }, + { + "epoch": 12.556014150943396, + "grad_norm": 4.06227445602417, + "learning_rate": 3.659247091118631e-06, + "loss": 0.3294, + "num_input_tokens_seen": 13898360, + "step": 21295 + }, + { + "epoch": 12.558962264150944, + "grad_norm": 2.698856830596924, + "learning_rate": 3.6567687750837027e-06, + "loss": 0.2824, + "num_input_tokens_seen": 13901464, + "step": 21300 + }, + { + "epoch": 12.56191037735849, + "grad_norm": 5.062452793121338, + "learning_rate": 3.6542908146750287e-06, + "loss": 0.3537, + "num_input_tokens_seen": 13904504, + "step": 21305 + }, + { + "epoch": 12.564858490566039, + "grad_norm": 3.246971368789673, + "learning_rate": 3.6518132105486624e-06, + "loss": 0.2647, + "num_input_tokens_seen": 13907896, + "step": 21310 + }, + { + "epoch": 12.567806603773585, + "grad_norm": 3.4587795734405518, + "learning_rate": 3.649335963360559e-06, + "loss": 0.4073, + "num_input_tokens_seen": 13911320, + "step": 21315 + }, + { + "epoch": 12.570754716981131, + "grad_norm": 3.8048665523529053, + "learning_rate": 3.6468590737665795e-06, + "loss": 0.3035, + "num_input_tokens_seen": 13915000, + "step": 21320 + }, + { + "epoch": 12.57370283018868, + "grad_norm": 3.344170570373535, + "learning_rate": 3.6443825424224926e-06, + "loss": 0.4027, + "num_input_tokens_seen": 13917560, + "step": 21325 + }, + { + "epoch": 12.576650943396226, + "grad_norm": 3.8680036067962646, + "learning_rate": 3.64190636998397e-06, + "loss": 0.2905, + "num_input_tokens_seen": 13920120, + "step": 21330 + }, + { + "epoch": 12.579599056603774, + "grad_norm": 2.8840997219085693, + "learning_rate": 3.639430557106588e-06, + "loss": 0.3219, + "num_input_tokens_seen": 13923032, + "step": 21335 + }, + { + "epoch": 12.58254716981132, + "grad_norm": 5.633397102355957, + "learning_rate": 3.6369551044458314e-06, + "loss": 0.3395, + "num_input_tokens_seen": 13926840, + "step": 21340 + }, + { + "epoch": 12.585495283018869, + "grad_norm": 6.972689151763916, + "learning_rate": 3.6344800126570846e-06, + "loss": 0.4165, + "num_input_tokens_seen": 13929400, + "step": 21345 + }, + { + "epoch": 12.588443396226415, + "grad_norm": 6.015566825866699, + "learning_rate": 3.6320052823956385e-06, + "loss": 0.2435, + "num_input_tokens_seen": 13932120, + "step": 21350 + }, + { + "epoch": 12.591391509433961, + "grad_norm": 5.883251190185547, + "learning_rate": 3.6295309143166906e-06, + "loss": 0.234, + "num_input_tokens_seen": 13935064, + "step": 21355 + }, + { + "epoch": 12.59433962264151, + "grad_norm": 3.9262642860412598, + "learning_rate": 3.62705690907534e-06, + "loss": 0.3314, + "num_input_tokens_seen": 13938584, + "step": 21360 + }, + { + "epoch": 12.597287735849056, + "grad_norm": 2.9461798667907715, + "learning_rate": 3.624583267326588e-06, + "loss": 0.3161, + "num_input_tokens_seen": 13941656, + "step": 21365 + }, + { + "epoch": 12.600235849056604, + "grad_norm": 4.13564920425415, + "learning_rate": 3.6221099897253454e-06, + "loss": 0.3433, + "num_input_tokens_seen": 13944472, + "step": 21370 + }, + { + "epoch": 12.60318396226415, + "grad_norm": 3.43648099899292, + "learning_rate": 3.619637076926421e-06, + "loss": 0.3209, + "num_input_tokens_seen": 13947512, + "step": 21375 + }, + { + "epoch": 12.606132075471699, + "grad_norm": 5.738428115844727, + "learning_rate": 3.617164529584528e-06, + "loss": 0.3816, + "num_input_tokens_seen": 13950808, + "step": 21380 + }, + { + "epoch": 12.609080188679245, + "grad_norm": 2.4414560794830322, + "learning_rate": 3.614692348354286e-06, + "loss": 0.2666, + "num_input_tokens_seen": 13954072, + "step": 21385 + }, + { + "epoch": 12.612028301886792, + "grad_norm": 2.1865594387054443, + "learning_rate": 3.612220533890216e-06, + "loss": 0.3719, + "num_input_tokens_seen": 13957016, + "step": 21390 + }, + { + "epoch": 12.61497641509434, + "grad_norm": 1.8305792808532715, + "learning_rate": 3.609749086846741e-06, + "loss": 0.2846, + "num_input_tokens_seen": 13960024, + "step": 21395 + }, + { + "epoch": 12.617924528301886, + "grad_norm": 2.4060370922088623, + "learning_rate": 3.607278007878186e-06, + "loss": 0.3851, + "num_input_tokens_seen": 13963320, + "step": 21400 + }, + { + "epoch": 12.620872641509434, + "grad_norm": 3.3054118156433105, + "learning_rate": 3.6048072976387817e-06, + "loss": 0.3583, + "num_input_tokens_seen": 13966424, + "step": 21405 + }, + { + "epoch": 12.62382075471698, + "grad_norm": 3.4472761154174805, + "learning_rate": 3.6023369567826585e-06, + "loss": 0.3542, + "num_input_tokens_seen": 13969688, + "step": 21410 + }, + { + "epoch": 12.626768867924529, + "grad_norm": 4.033164024353027, + "learning_rate": 3.599866985963848e-06, + "loss": 0.3137, + "num_input_tokens_seen": 13975928, + "step": 21415 + }, + { + "epoch": 12.629716981132075, + "grad_norm": 3.188581943511963, + "learning_rate": 3.5973973858362885e-06, + "loss": 0.3308, + "num_input_tokens_seen": 13979128, + "step": 21420 + }, + { + "epoch": 12.632665094339622, + "grad_norm": 3.5570056438446045, + "learning_rate": 3.594928157053816e-06, + "loss": 0.3491, + "num_input_tokens_seen": 13982488, + "step": 21425 + }, + { + "epoch": 12.63561320754717, + "grad_norm": 2.1358227729797363, + "learning_rate": 3.592459300270168e-06, + "loss": 0.283, + "num_input_tokens_seen": 13986232, + "step": 21430 + }, + { + "epoch": 12.638561320754716, + "grad_norm": 3.0476815700531006, + "learning_rate": 3.589990816138988e-06, + "loss": 0.3874, + "num_input_tokens_seen": 13989016, + "step": 21435 + }, + { + "epoch": 12.641509433962264, + "grad_norm": 2.768232583999634, + "learning_rate": 3.587522705313816e-06, + "loss": 0.3531, + "num_input_tokens_seen": 13998008, + "step": 21440 + }, + { + "epoch": 12.64445754716981, + "grad_norm": 3.8550169467926025, + "learning_rate": 3.585054968448094e-06, + "loss": 0.3027, + "num_input_tokens_seen": 14001240, + "step": 21445 + }, + { + "epoch": 12.647405660377359, + "grad_norm": 3.9696741104125977, + "learning_rate": 3.5825876061951686e-06, + "loss": 0.4439, + "num_input_tokens_seen": 14004024, + "step": 21450 + }, + { + "epoch": 12.650353773584905, + "grad_norm": 2.8694238662719727, + "learning_rate": 3.5801206192082818e-06, + "loss": 0.2395, + "num_input_tokens_seen": 14006712, + "step": 21455 + }, + { + "epoch": 12.653301886792454, + "grad_norm": 4.577816486358643, + "learning_rate": 3.577654008140582e-06, + "loss": 0.3093, + "num_input_tokens_seen": 14010648, + "step": 21460 + }, + { + "epoch": 12.65625, + "grad_norm": 2.8944475650787354, + "learning_rate": 3.5751877736451123e-06, + "loss": 0.2982, + "num_input_tokens_seen": 14013080, + "step": 21465 + }, + { + "epoch": 12.659198113207546, + "grad_norm": 3.420577049255371, + "learning_rate": 3.5727219163748205e-06, + "loss": 0.3245, + "num_input_tokens_seen": 14016088, + "step": 21470 + }, + { + "epoch": 12.662146226415095, + "grad_norm": 5.421682357788086, + "learning_rate": 3.570256436982552e-06, + "loss": 0.3602, + "num_input_tokens_seen": 14018392, + "step": 21475 + }, + { + "epoch": 12.665094339622641, + "grad_norm": 2.6409735679626465, + "learning_rate": 3.5677913361210536e-06, + "loss": 0.3106, + "num_input_tokens_seen": 14021560, + "step": 21480 + }, + { + "epoch": 12.66804245283019, + "grad_norm": 4.527918815612793, + "learning_rate": 3.565326614442972e-06, + "loss": 0.3472, + "num_input_tokens_seen": 14025176, + "step": 21485 + }, + { + "epoch": 12.670990566037736, + "grad_norm": 3.998150110244751, + "learning_rate": 3.5628622726008523e-06, + "loss": 0.3692, + "num_input_tokens_seen": 14028408, + "step": 21490 + }, + { + "epoch": 12.673938679245284, + "grad_norm": 4.9621429443359375, + "learning_rate": 3.56039831124714e-06, + "loss": 0.4373, + "num_input_tokens_seen": 14032024, + "step": 21495 + }, + { + "epoch": 12.67688679245283, + "grad_norm": 4.055542469024658, + "learning_rate": 3.557934731034179e-06, + "loss": 0.27, + "num_input_tokens_seen": 14034456, + "step": 21500 + }, + { + "epoch": 12.679834905660378, + "grad_norm": 2.7810192108154297, + "learning_rate": 3.5554715326142126e-06, + "loss": 0.352, + "num_input_tokens_seen": 14038584, + "step": 21505 + }, + { + "epoch": 12.682783018867925, + "grad_norm": 4.445689678192139, + "learning_rate": 3.553008716639384e-06, + "loss": 0.2779, + "num_input_tokens_seen": 14041272, + "step": 21510 + }, + { + "epoch": 12.685731132075471, + "grad_norm": 3.112377405166626, + "learning_rate": 3.5505462837617338e-06, + "loss": 0.3029, + "num_input_tokens_seen": 14045272, + "step": 21515 + }, + { + "epoch": 12.68867924528302, + "grad_norm": 4.456690788269043, + "learning_rate": 3.5480842346332013e-06, + "loss": 0.3017, + "num_input_tokens_seen": 14048280, + "step": 21520 + }, + { + "epoch": 12.691627358490566, + "grad_norm": 3.0293211936950684, + "learning_rate": 3.5456225699056256e-06, + "loss": 0.3993, + "num_input_tokens_seen": 14052184, + "step": 21525 + }, + { + "epoch": 12.694575471698114, + "grad_norm": 3.8884997367858887, + "learning_rate": 3.5431612902307426e-06, + "loss": 0.3501, + "num_input_tokens_seen": 14055768, + "step": 21530 + }, + { + "epoch": 12.69752358490566, + "grad_norm": 3.3717899322509766, + "learning_rate": 3.540700396260186e-06, + "loss": 0.3326, + "num_input_tokens_seen": 14058072, + "step": 21535 + }, + { + "epoch": 12.700471698113208, + "grad_norm": 6.659128665924072, + "learning_rate": 3.538239888645489e-06, + "loss": 0.4521, + "num_input_tokens_seen": 14064184, + "step": 21540 + }, + { + "epoch": 12.703419811320755, + "grad_norm": 4.410452842712402, + "learning_rate": 3.535779768038082e-06, + "loss": 0.3504, + "num_input_tokens_seen": 14067736, + "step": 21545 + }, + { + "epoch": 12.706367924528301, + "grad_norm": 2.695115566253662, + "learning_rate": 3.5333200350892905e-06, + "loss": 0.2695, + "num_input_tokens_seen": 14071000, + "step": 21550 + }, + { + "epoch": 12.70931603773585, + "grad_norm": 2.7699124813079834, + "learning_rate": 3.530860690450342e-06, + "loss": 0.4768, + "num_input_tokens_seen": 14073880, + "step": 21555 + }, + { + "epoch": 12.712264150943396, + "grad_norm": 2.480238437652588, + "learning_rate": 3.528401734772357e-06, + "loss": 0.3675, + "num_input_tokens_seen": 14076440, + "step": 21560 + }, + { + "epoch": 12.715212264150944, + "grad_norm": 4.693593502044678, + "learning_rate": 3.5259431687063538e-06, + "loss": 0.3076, + "num_input_tokens_seen": 14080056, + "step": 21565 + }, + { + "epoch": 12.71816037735849, + "grad_norm": 3.145026683807373, + "learning_rate": 3.523484992903249e-06, + "loss": 0.3149, + "num_input_tokens_seen": 14082648, + "step": 21570 + }, + { + "epoch": 12.721108490566039, + "grad_norm": 3.3825626373291016, + "learning_rate": 3.5210272080138573e-06, + "loss": 0.3692, + "num_input_tokens_seen": 14084824, + "step": 21575 + }, + { + "epoch": 12.724056603773585, + "grad_norm": 6.032142162322998, + "learning_rate": 3.518569814688887e-06, + "loss": 0.3751, + "num_input_tokens_seen": 14087736, + "step": 21580 + }, + { + "epoch": 12.727004716981131, + "grad_norm": 2.63630747795105, + "learning_rate": 3.5161128135789414e-06, + "loss": 0.3009, + "num_input_tokens_seen": 14090840, + "step": 21585 + }, + { + "epoch": 12.72995283018868, + "grad_norm": 5.657066822052002, + "learning_rate": 3.513656205334525e-06, + "loss": 0.285, + "num_input_tokens_seen": 14094008, + "step": 21590 + }, + { + "epoch": 12.732900943396226, + "grad_norm": 2.8135833740234375, + "learning_rate": 3.5111999906060336e-06, + "loss": 0.3837, + "num_input_tokens_seen": 14097176, + "step": 21595 + }, + { + "epoch": 12.735849056603774, + "grad_norm": 3.0824592113494873, + "learning_rate": 3.50874417004376e-06, + "loss": 0.2791, + "num_input_tokens_seen": 14100536, + "step": 21600 + }, + { + "epoch": 12.73879716981132, + "grad_norm": 4.154845237731934, + "learning_rate": 3.5062887442978956e-06, + "loss": 0.344, + "num_input_tokens_seen": 14103576, + "step": 21605 + }, + { + "epoch": 12.741745283018869, + "grad_norm": 1.583423137664795, + "learning_rate": 3.503833714018524e-06, + "loss": 0.3228, + "num_input_tokens_seen": 14106872, + "step": 21610 + }, + { + "epoch": 12.744693396226415, + "grad_norm": 3.2842671871185303, + "learning_rate": 3.5013790798556228e-06, + "loss": 0.3708, + "num_input_tokens_seen": 14110712, + "step": 21615 + }, + { + "epoch": 12.747641509433961, + "grad_norm": 3.0496938228607178, + "learning_rate": 3.4989248424590705e-06, + "loss": 0.3733, + "num_input_tokens_seen": 14114008, + "step": 21620 + }, + { + "epoch": 12.75058962264151, + "grad_norm": 3.9015731811523438, + "learning_rate": 3.4964710024786354e-06, + "loss": 0.3742, + "num_input_tokens_seen": 14117112, + "step": 21625 + }, + { + "epoch": 12.753537735849056, + "grad_norm": 4.6696319580078125, + "learning_rate": 3.4940175605639813e-06, + "loss": 0.3433, + "num_input_tokens_seen": 14119896, + "step": 21630 + }, + { + "epoch": 12.756485849056604, + "grad_norm": 4.118602275848389, + "learning_rate": 3.4915645173646694e-06, + "loss": 0.5534, + "num_input_tokens_seen": 14122840, + "step": 21635 + }, + { + "epoch": 12.75943396226415, + "grad_norm": 7.686024188995361, + "learning_rate": 3.489111873530153e-06, + "loss": 0.4638, + "num_input_tokens_seen": 14125304, + "step": 21640 + }, + { + "epoch": 12.762382075471699, + "grad_norm": 4.889338493347168, + "learning_rate": 3.4866596297097776e-06, + "loss": 0.4115, + "num_input_tokens_seen": 14128856, + "step": 21645 + }, + { + "epoch": 12.765330188679245, + "grad_norm": 3.0874056816101074, + "learning_rate": 3.484207786552789e-06, + "loss": 0.3267, + "num_input_tokens_seen": 14132024, + "step": 21650 + }, + { + "epoch": 12.768278301886792, + "grad_norm": 2.975493907928467, + "learning_rate": 3.4817563447083214e-06, + "loss": 0.3325, + "num_input_tokens_seen": 14135192, + "step": 21655 + }, + { + "epoch": 12.77122641509434, + "grad_norm": 4.667953968048096, + "learning_rate": 3.4793053048254044e-06, + "loss": 0.3522, + "num_input_tokens_seen": 14138776, + "step": 21660 + }, + { + "epoch": 12.774174528301886, + "grad_norm": 11.657426834106445, + "learning_rate": 3.47685466755296e-06, + "loss": 0.3625, + "num_input_tokens_seen": 14142136, + "step": 21665 + }, + { + "epoch": 12.777122641509434, + "grad_norm": 4.863961219787598, + "learning_rate": 3.474404433539809e-06, + "loss": 0.3591, + "num_input_tokens_seen": 14145400, + "step": 21670 + }, + { + "epoch": 12.78007075471698, + "grad_norm": 4.171825408935547, + "learning_rate": 3.4719546034346598e-06, + "loss": 0.3448, + "num_input_tokens_seen": 14149016, + "step": 21675 + }, + { + "epoch": 12.783018867924529, + "grad_norm": 2.2246859073638916, + "learning_rate": 3.4695051778861125e-06, + "loss": 0.3569, + "num_input_tokens_seen": 14152088, + "step": 21680 + }, + { + "epoch": 12.785966981132075, + "grad_norm": 2.7536182403564453, + "learning_rate": 3.4670561575426677e-06, + "loss": 0.3916, + "num_input_tokens_seen": 14154840, + "step": 21685 + }, + { + "epoch": 12.788915094339622, + "grad_norm": 5.0762104988098145, + "learning_rate": 3.4646075430527115e-06, + "loss": 0.3309, + "num_input_tokens_seen": 14158392, + "step": 21690 + }, + { + "epoch": 12.79186320754717, + "grad_norm": 4.05437707901001, + "learning_rate": 3.4621593350645236e-06, + "loss": 0.3953, + "num_input_tokens_seen": 14161560, + "step": 21695 + }, + { + "epoch": 12.794811320754716, + "grad_norm": 3.5340235233306885, + "learning_rate": 3.4597115342262817e-06, + "loss": 0.5333, + "num_input_tokens_seen": 14165272, + "step": 21700 + }, + { + "epoch": 12.797759433962264, + "grad_norm": 2.641016721725464, + "learning_rate": 3.4572641411860484e-06, + "loss": 0.2882, + "num_input_tokens_seen": 14168312, + "step": 21705 + }, + { + "epoch": 12.80070754716981, + "grad_norm": 4.4181647300720215, + "learning_rate": 3.454817156591782e-06, + "loss": 0.335, + "num_input_tokens_seen": 14171544, + "step": 21710 + }, + { + "epoch": 12.803655660377359, + "grad_norm": 3.4987332820892334, + "learning_rate": 3.4523705810913344e-06, + "loss": 0.3581, + "num_input_tokens_seen": 14174008, + "step": 21715 + }, + { + "epoch": 12.806603773584905, + "grad_norm": 8.13829517364502, + "learning_rate": 3.449924415332443e-06, + "loss": 0.298, + "num_input_tokens_seen": 14177560, + "step": 21720 + }, + { + "epoch": 12.809551886792454, + "grad_norm": 3.8514766693115234, + "learning_rate": 3.447478659962745e-06, + "loss": 0.3413, + "num_input_tokens_seen": 14180632, + "step": 21725 + }, + { + "epoch": 12.8125, + "grad_norm": 4.029857158660889, + "learning_rate": 3.4450333156297625e-06, + "loss": 0.4951, + "num_input_tokens_seen": 14183960, + "step": 21730 + }, + { + "epoch": 12.815448113207546, + "grad_norm": 4.088692665100098, + "learning_rate": 3.44258838298091e-06, + "loss": 0.3659, + "num_input_tokens_seen": 14186616, + "step": 21735 + }, + { + "epoch": 12.818396226415095, + "grad_norm": 3.341370105743408, + "learning_rate": 3.440143862663497e-06, + "loss": 0.3283, + "num_input_tokens_seen": 14189688, + "step": 21740 + }, + { + "epoch": 12.821344339622641, + "grad_norm": 4.7779412269592285, + "learning_rate": 3.4376997553247183e-06, + "loss": 0.2844, + "num_input_tokens_seen": 14192984, + "step": 21745 + }, + { + "epoch": 12.82429245283019, + "grad_norm": 3.5784049034118652, + "learning_rate": 3.4352560616116617e-06, + "loss": 0.3077, + "num_input_tokens_seen": 14196856, + "step": 21750 + }, + { + "epoch": 12.827240566037736, + "grad_norm": 3.40222430229187, + "learning_rate": 3.4328127821713077e-06, + "loss": 0.5122, + "num_input_tokens_seen": 14199736, + "step": 21755 + }, + { + "epoch": 12.830188679245284, + "grad_norm": 3.3480069637298584, + "learning_rate": 3.430369917650521e-06, + "loss": 0.301, + "num_input_tokens_seen": 14202872, + "step": 21760 + }, + { + "epoch": 12.83313679245283, + "grad_norm": 2.4822418689727783, + "learning_rate": 3.427927468696066e-06, + "loss": 0.3869, + "num_input_tokens_seen": 14206232, + "step": 21765 + }, + { + "epoch": 12.836084905660378, + "grad_norm": 2.68818736076355, + "learning_rate": 3.425485435954588e-06, + "loss": 0.3546, + "num_input_tokens_seen": 14209464, + "step": 21770 + }, + { + "epoch": 12.839033018867925, + "grad_norm": 3.985126256942749, + "learning_rate": 3.4230438200726274e-06, + "loss": 0.3767, + "num_input_tokens_seen": 14212920, + "step": 21775 + }, + { + "epoch": 12.841981132075471, + "grad_norm": 2.7161736488342285, + "learning_rate": 3.4206026216966113e-06, + "loss": 0.356, + "num_input_tokens_seen": 14215480, + "step": 21780 + }, + { + "epoch": 12.84492924528302, + "grad_norm": 2.8949382305145264, + "learning_rate": 3.418161841472858e-06, + "loss": 0.4151, + "num_input_tokens_seen": 14218488, + "step": 21785 + }, + { + "epoch": 12.847877358490566, + "grad_norm": 3.0132381916046143, + "learning_rate": 3.4157214800475746e-06, + "loss": 0.316, + "num_input_tokens_seen": 14221752, + "step": 21790 + }, + { + "epoch": 12.850825471698114, + "grad_norm": 2.1272871494293213, + "learning_rate": 3.4132815380668577e-06, + "loss": 0.2872, + "num_input_tokens_seen": 14224888, + "step": 21795 + }, + { + "epoch": 12.85377358490566, + "grad_norm": 3.0509471893310547, + "learning_rate": 3.410842016176691e-06, + "loss": 0.3791, + "num_input_tokens_seen": 14228568, + "step": 21800 + }, + { + "epoch": 12.856721698113208, + "grad_norm": 3.1193125247955322, + "learning_rate": 3.4084029150229503e-06, + "loss": 0.2852, + "num_input_tokens_seen": 14232088, + "step": 21805 + }, + { + "epoch": 12.859669811320755, + "grad_norm": 1.9920432567596436, + "learning_rate": 3.4059642352513965e-06, + "loss": 0.3437, + "num_input_tokens_seen": 14235448, + "step": 21810 + }, + { + "epoch": 12.862617924528301, + "grad_norm": 3.6852011680603027, + "learning_rate": 3.4035259775076813e-06, + "loss": 0.3702, + "num_input_tokens_seen": 14238904, + "step": 21815 + }, + { + "epoch": 12.86556603773585, + "grad_norm": 2.139373540878296, + "learning_rate": 3.401088142437344e-06, + "loss": 0.2566, + "num_input_tokens_seen": 14242840, + "step": 21820 + }, + { + "epoch": 12.868514150943396, + "grad_norm": 2.9665040969848633, + "learning_rate": 3.398650730685813e-06, + "loss": 0.2887, + "num_input_tokens_seen": 14245496, + "step": 21825 + }, + { + "epoch": 12.871462264150944, + "grad_norm": 1.9119702577590942, + "learning_rate": 3.396213742898401e-06, + "loss": 0.2586, + "num_input_tokens_seen": 14250168, + "step": 21830 + }, + { + "epoch": 12.87441037735849, + "grad_norm": 3.0877885818481445, + "learning_rate": 3.3937771797203134e-06, + "loss": 0.4301, + "num_input_tokens_seen": 14253752, + "step": 21835 + }, + { + "epoch": 12.877358490566039, + "grad_norm": 1.91194748878479, + "learning_rate": 3.391341041796641e-06, + "loss": 0.2607, + "num_input_tokens_seen": 14257560, + "step": 21840 + }, + { + "epoch": 12.880306603773585, + "grad_norm": 3.223383903503418, + "learning_rate": 3.3889053297723585e-06, + "loss": 0.2537, + "num_input_tokens_seen": 14262008, + "step": 21845 + }, + { + "epoch": 12.883254716981131, + "grad_norm": 2.230386972427368, + "learning_rate": 3.3864700442923342e-06, + "loss": 0.288, + "num_input_tokens_seen": 14265560, + "step": 21850 + }, + { + "epoch": 12.88620283018868, + "grad_norm": 6.485767364501953, + "learning_rate": 3.384035186001318e-06, + "loss": 0.3272, + "num_input_tokens_seen": 14268536, + "step": 21855 + }, + { + "epoch": 12.889150943396226, + "grad_norm": 6.27098274230957, + "learning_rate": 3.381600755543953e-06, + "loss": 0.309, + "num_input_tokens_seen": 14270904, + "step": 21860 + }, + { + "epoch": 12.892099056603774, + "grad_norm": 7.32874870300293, + "learning_rate": 3.3791667535647615e-06, + "loss": 0.3665, + "num_input_tokens_seen": 14274232, + "step": 21865 + }, + { + "epoch": 12.89504716981132, + "grad_norm": 4.632501125335693, + "learning_rate": 3.3767331807081584e-06, + "loss": 0.3601, + "num_input_tokens_seen": 14278808, + "step": 21870 + }, + { + "epoch": 12.897995283018869, + "grad_norm": 3.200711488723755, + "learning_rate": 3.374300037618442e-06, + "loss": 0.2633, + "num_input_tokens_seen": 14281560, + "step": 21875 + }, + { + "epoch": 12.900943396226415, + "grad_norm": 4.05991268157959, + "learning_rate": 3.371867324939796e-06, + "loss": 0.2684, + "num_input_tokens_seen": 14284760, + "step": 21880 + }, + { + "epoch": 12.903891509433961, + "grad_norm": 6.064416408538818, + "learning_rate": 3.369435043316293e-06, + "loss": 0.4136, + "num_input_tokens_seen": 14287320, + "step": 21885 + }, + { + "epoch": 12.90683962264151, + "grad_norm": 3.5634496212005615, + "learning_rate": 3.36700319339189e-06, + "loss": 0.3875, + "num_input_tokens_seen": 14290840, + "step": 21890 + }, + { + "epoch": 12.909787735849056, + "grad_norm": 2.4734292030334473, + "learning_rate": 3.3645717758104286e-06, + "loss": 0.3353, + "num_input_tokens_seen": 14295736, + "step": 21895 + }, + { + "epoch": 12.912735849056604, + "grad_norm": 2.804713726043701, + "learning_rate": 3.3621407912156383e-06, + "loss": 0.3002, + "num_input_tokens_seen": 14299000, + "step": 21900 + }, + { + "epoch": 12.91568396226415, + "grad_norm": 2.8870646953582764, + "learning_rate": 3.3597102402511326e-06, + "loss": 0.3823, + "num_input_tokens_seen": 14302296, + "step": 21905 + }, + { + "epoch": 12.918632075471699, + "grad_norm": 4.311373233795166, + "learning_rate": 3.3572801235604093e-06, + "loss": 0.302, + "num_input_tokens_seen": 14305336, + "step": 21910 + }, + { + "epoch": 12.921580188679245, + "grad_norm": 3.7751832008361816, + "learning_rate": 3.3548504417868538e-06, + "loss": 0.354, + "num_input_tokens_seen": 14309240, + "step": 21915 + }, + { + "epoch": 12.924528301886792, + "grad_norm": 2.933415651321411, + "learning_rate": 3.352421195573734e-06, + "loss": 0.3393, + "num_input_tokens_seen": 14313368, + "step": 21920 + }, + { + "epoch": 12.92747641509434, + "grad_norm": 3.7907328605651855, + "learning_rate": 3.3499923855642026e-06, + "loss": 0.496, + "num_input_tokens_seen": 14316152, + "step": 21925 + }, + { + "epoch": 12.930424528301886, + "grad_norm": 2.819135904312134, + "learning_rate": 3.3475640124012986e-06, + "loss": 0.3256, + "num_input_tokens_seen": 14319032, + "step": 21930 + }, + { + "epoch": 12.933372641509434, + "grad_norm": 1.99680495262146, + "learning_rate": 3.345136076727945e-06, + "loss": 0.3823, + "num_input_tokens_seen": 14322264, + "step": 21935 + }, + { + "epoch": 12.93632075471698, + "grad_norm": 3.8027737140655518, + "learning_rate": 3.3427085791869453e-06, + "loss": 0.4122, + "num_input_tokens_seen": 14325528, + "step": 21940 + }, + { + "epoch": 12.939268867924529, + "grad_norm": 2.7927749156951904, + "learning_rate": 3.3402815204209926e-06, + "loss": 0.3185, + "num_input_tokens_seen": 14328600, + "step": 21945 + }, + { + "epoch": 12.942216981132075, + "grad_norm": 2.7212982177734375, + "learning_rate": 3.337854901072659e-06, + "loss": 0.3109, + "num_input_tokens_seen": 14331832, + "step": 21950 + }, + { + "epoch": 12.945165094339622, + "grad_norm": 2.5636136531829834, + "learning_rate": 3.3354287217844056e-06, + "loss": 0.2806, + "num_input_tokens_seen": 14334328, + "step": 21955 + }, + { + "epoch": 12.94811320754717, + "grad_norm": 3.317197561264038, + "learning_rate": 3.3330029831985712e-06, + "loss": 0.3336, + "num_input_tokens_seen": 14337816, + "step": 21960 + }, + { + "epoch": 12.951061320754716, + "grad_norm": 2.771249771118164, + "learning_rate": 3.330577685957382e-06, + "loss": 0.3657, + "num_input_tokens_seen": 14340536, + "step": 21965 + }, + { + "epoch": 12.954009433962264, + "grad_norm": 2.747359275817871, + "learning_rate": 3.3281528307029454e-06, + "loss": 0.3736, + "num_input_tokens_seen": 14343448, + "step": 21970 + }, + { + "epoch": 12.95695754716981, + "grad_norm": 2.5416488647460938, + "learning_rate": 3.325728418077251e-06, + "loss": 0.3106, + "num_input_tokens_seen": 14347320, + "step": 21975 + }, + { + "epoch": 12.959905660377359, + "grad_norm": 3.9098031520843506, + "learning_rate": 3.3233044487221744e-06, + "loss": 0.3581, + "num_input_tokens_seen": 14350392, + "step": 21980 + }, + { + "epoch": 12.962853773584905, + "grad_norm": 3.4075942039489746, + "learning_rate": 3.3208809232794715e-06, + "loss": 0.356, + "num_input_tokens_seen": 14354296, + "step": 21985 + }, + { + "epoch": 12.965801886792454, + "grad_norm": 2.6556880474090576, + "learning_rate": 3.3184578423907797e-06, + "loss": 0.2802, + "num_input_tokens_seen": 14357688, + "step": 21990 + }, + { + "epoch": 12.96875, + "grad_norm": 2.916381597518921, + "learning_rate": 3.3160352066976224e-06, + "loss": 0.4164, + "num_input_tokens_seen": 14360312, + "step": 21995 + }, + { + "epoch": 12.971698113207546, + "grad_norm": 4.432518005371094, + "learning_rate": 3.3136130168414003e-06, + "loss": 0.4144, + "num_input_tokens_seen": 14363288, + "step": 22000 + }, + { + "epoch": 12.974646226415095, + "grad_norm": 2.3157541751861572, + "learning_rate": 3.311191273463401e-06, + "loss": 0.2744, + "num_input_tokens_seen": 14367032, + "step": 22005 + }, + { + "epoch": 12.977594339622641, + "grad_norm": 3.246169328689575, + "learning_rate": 3.3087699772047908e-06, + "loss": 0.4209, + "num_input_tokens_seen": 14370296, + "step": 22010 + }, + { + "epoch": 12.98054245283019, + "grad_norm": 3.3660190105438232, + "learning_rate": 3.3063491287066164e-06, + "loss": 0.4005, + "num_input_tokens_seen": 14372792, + "step": 22015 + }, + { + "epoch": 12.983490566037736, + "grad_norm": 2.3737194538116455, + "learning_rate": 3.303928728609811e-06, + "loss": 0.2704, + "num_input_tokens_seen": 14375928, + "step": 22020 + }, + { + "epoch": 12.986438679245284, + "grad_norm": 3.029210090637207, + "learning_rate": 3.3015087775551835e-06, + "loss": 0.3336, + "num_input_tokens_seen": 14379608, + "step": 22025 + }, + { + "epoch": 12.98938679245283, + "grad_norm": 3.567734956741333, + "learning_rate": 3.299089276183427e-06, + "loss": 0.3712, + "num_input_tokens_seen": 14382872, + "step": 22030 + }, + { + "epoch": 12.992334905660378, + "grad_norm": 2.584099292755127, + "learning_rate": 3.2966702251351157e-06, + "loss": 0.3521, + "num_input_tokens_seen": 14386136, + "step": 22035 + }, + { + "epoch": 12.995283018867925, + "grad_norm": 3.450352191925049, + "learning_rate": 3.2942516250507035e-06, + "loss": 0.3022, + "num_input_tokens_seen": 14389912, + "step": 22040 + }, + { + "epoch": 12.998231132075471, + "grad_norm": 3.222008228302002, + "learning_rate": 3.2918334765705227e-06, + "loss": 0.279, + "num_input_tokens_seen": 14393368, + "step": 22045 + }, + { + "epoch": 13.00117924528302, + "grad_norm": 2.2067952156066895, + "learning_rate": 3.289415780334792e-06, + "loss": 0.2082, + "num_input_tokens_seen": 14396072, + "step": 22050 + }, + { + "epoch": 13.004127358490566, + "grad_norm": 4.023777008056641, + "learning_rate": 3.2869985369836067e-06, + "loss": 0.3529, + "num_input_tokens_seen": 14398920, + "step": 22055 + }, + { + "epoch": 13.007075471698114, + "grad_norm": 1.790043830871582, + "learning_rate": 3.2845817471569406e-06, + "loss": 0.2676, + "num_input_tokens_seen": 14403080, + "step": 22060 + }, + { + "epoch": 13.01002358490566, + "grad_norm": 4.008598804473877, + "learning_rate": 3.28216541149465e-06, + "loss": 0.351, + "num_input_tokens_seen": 14407176, + "step": 22065 + }, + { + "epoch": 13.012971698113208, + "grad_norm": 4.423854827880859, + "learning_rate": 3.2797495306364707e-06, + "loss": 0.2045, + "num_input_tokens_seen": 14409864, + "step": 22070 + }, + { + "epoch": 13.015919811320755, + "grad_norm": 6.915042400360107, + "learning_rate": 3.2773341052220174e-06, + "loss": 0.442, + "num_input_tokens_seen": 14413384, + "step": 22075 + }, + { + "epoch": 13.018867924528301, + "grad_norm": 19.09061622619629, + "learning_rate": 3.274919135890783e-06, + "loss": 0.449, + "num_input_tokens_seen": 14415912, + "step": 22080 + }, + { + "epoch": 13.02181603773585, + "grad_norm": 3.322899103164673, + "learning_rate": 3.2725046232821424e-06, + "loss": 0.2272, + "num_input_tokens_seen": 14418792, + "step": 22085 + }, + { + "epoch": 13.024764150943396, + "grad_norm": 3.04266619682312, + "learning_rate": 3.270090568035348e-06, + "loss": 0.4411, + "num_input_tokens_seen": 14421960, + "step": 22090 + }, + { + "epoch": 13.027712264150944, + "grad_norm": 3.0039806365966797, + "learning_rate": 3.2676769707895306e-06, + "loss": 0.3193, + "num_input_tokens_seen": 14425000, + "step": 22095 + }, + { + "epoch": 13.03066037735849, + "grad_norm": 2.5538010597229004, + "learning_rate": 3.2652638321837015e-06, + "loss": 0.385, + "num_input_tokens_seen": 14429576, + "step": 22100 + }, + { + "epoch": 13.033608490566039, + "grad_norm": 2.373823881149292, + "learning_rate": 3.2628511528567497e-06, + "loss": 0.3257, + "num_input_tokens_seen": 14432168, + "step": 22105 + }, + { + "epoch": 13.036556603773585, + "grad_norm": 2.758582353591919, + "learning_rate": 3.2604389334474407e-06, + "loss": 0.3248, + "num_input_tokens_seen": 14435848, + "step": 22110 + }, + { + "epoch": 13.039504716981131, + "grad_norm": 3.8507070541381836, + "learning_rate": 3.2580271745944224e-06, + "loss": 0.412, + "num_input_tokens_seen": 14439336, + "step": 22115 + }, + { + "epoch": 13.04245283018868, + "grad_norm": 5.218119144439697, + "learning_rate": 3.255615876936217e-06, + "loss": 0.2168, + "num_input_tokens_seen": 14442568, + "step": 22120 + }, + { + "epoch": 13.045400943396226, + "grad_norm": 3.8426151275634766, + "learning_rate": 3.2532050411112248e-06, + "loss": 0.2555, + "num_input_tokens_seen": 14445736, + "step": 22125 + }, + { + "epoch": 13.048349056603774, + "grad_norm": 3.670010566711426, + "learning_rate": 3.2507946677577274e-06, + "loss": 0.2493, + "num_input_tokens_seen": 14450152, + "step": 22130 + }, + { + "epoch": 13.05129716981132, + "grad_norm": 3.5470523834228516, + "learning_rate": 3.2483847575138807e-06, + "loss": 0.3562, + "num_input_tokens_seen": 14453672, + "step": 22135 + }, + { + "epoch": 13.054245283018869, + "grad_norm": 3.439145565032959, + "learning_rate": 3.245975311017716e-06, + "loss": 0.3402, + "num_input_tokens_seen": 14456872, + "step": 22140 + }, + { + "epoch": 13.057193396226415, + "grad_norm": 3.6502089500427246, + "learning_rate": 3.2435663289071486e-06, + "loss": 0.3406, + "num_input_tokens_seen": 14460296, + "step": 22145 + }, + { + "epoch": 13.060141509433961, + "grad_norm": 1.867048978805542, + "learning_rate": 3.241157811819966e-06, + "loss": 0.3162, + "num_input_tokens_seen": 14463048, + "step": 22150 + }, + { + "epoch": 13.06308962264151, + "grad_norm": 4.080491542816162, + "learning_rate": 3.2387497603938327e-06, + "loss": 0.3425, + "num_input_tokens_seen": 14465864, + "step": 22155 + }, + { + "epoch": 13.066037735849056, + "grad_norm": 7.849427700042725, + "learning_rate": 3.2363421752662903e-06, + "loss": 0.4108, + "num_input_tokens_seen": 14468744, + "step": 22160 + }, + { + "epoch": 13.068985849056604, + "grad_norm": 3.0759518146514893, + "learning_rate": 3.233935057074759e-06, + "loss": 0.2618, + "num_input_tokens_seen": 14472360, + "step": 22165 + }, + { + "epoch": 13.07193396226415, + "grad_norm": 3.8710527420043945, + "learning_rate": 3.2315284064565324e-06, + "loss": 0.3395, + "num_input_tokens_seen": 14474824, + "step": 22170 + }, + { + "epoch": 13.074882075471699, + "grad_norm": 2.691591739654541, + "learning_rate": 3.2291222240487813e-06, + "loss": 0.2309, + "num_input_tokens_seen": 14477544, + "step": 22175 + }, + { + "epoch": 13.077830188679245, + "grad_norm": 3.781956672668457, + "learning_rate": 3.226716510488554e-06, + "loss": 0.3505, + "num_input_tokens_seen": 14481224, + "step": 22180 + }, + { + "epoch": 13.080778301886792, + "grad_norm": 1.726909875869751, + "learning_rate": 3.224311266412773e-06, + "loss": 0.3061, + "num_input_tokens_seen": 14483688, + "step": 22185 + }, + { + "epoch": 13.08372641509434, + "grad_norm": 3.7889645099639893, + "learning_rate": 3.2219064924582366e-06, + "loss": 0.337, + "num_input_tokens_seen": 14487112, + "step": 22190 + }, + { + "epoch": 13.086674528301886, + "grad_norm": 2.6657702922821045, + "learning_rate": 3.2195021892616197e-06, + "loss": 0.3797, + "num_input_tokens_seen": 14492104, + "step": 22195 + }, + { + "epoch": 13.089622641509434, + "grad_norm": 2.5574259757995605, + "learning_rate": 3.217098357459472e-06, + "loss": 0.3244, + "num_input_tokens_seen": 14496040, + "step": 22200 + }, + { + "epoch": 13.09257075471698, + "grad_norm": 3.7597057819366455, + "learning_rate": 3.214694997688217e-06, + "loss": 0.4146, + "num_input_tokens_seen": 14499432, + "step": 22205 + }, + { + "epoch": 13.095518867924529, + "grad_norm": 3.8260746002197266, + "learning_rate": 3.2122921105841572e-06, + "loss": 0.3313, + "num_input_tokens_seen": 14502664, + "step": 22210 + }, + { + "epoch": 13.098466981132075, + "grad_norm": 2.786687135696411, + "learning_rate": 3.2098896967834647e-06, + "loss": 0.4458, + "num_input_tokens_seen": 14506440, + "step": 22215 + }, + { + "epoch": 13.101415094339623, + "grad_norm": 3.3483920097351074, + "learning_rate": 3.2074877569221896e-06, + "loss": 0.284, + "num_input_tokens_seen": 14509160, + "step": 22220 + }, + { + "epoch": 13.10436320754717, + "grad_norm": 3.296718120574951, + "learning_rate": 3.205086291636257e-06, + "loss": 0.413, + "num_input_tokens_seen": 14512520, + "step": 22225 + }, + { + "epoch": 13.107311320754716, + "grad_norm": 11.569144248962402, + "learning_rate": 3.202685301561463e-06, + "loss": 0.5629, + "num_input_tokens_seen": 14517096, + "step": 22230 + }, + { + "epoch": 13.110259433962264, + "grad_norm": 4.862022399902344, + "learning_rate": 3.200284787333482e-06, + "loss": 0.2871, + "num_input_tokens_seen": 14519912, + "step": 22235 + }, + { + "epoch": 13.11320754716981, + "grad_norm": 2.805330276489258, + "learning_rate": 3.1978847495878595e-06, + "loss": 0.2783, + "num_input_tokens_seen": 14524680, + "step": 22240 + }, + { + "epoch": 13.116155660377359, + "grad_norm": 3.4541144371032715, + "learning_rate": 3.1954851889600176e-06, + "loss": 0.3445, + "num_input_tokens_seen": 14527912, + "step": 22245 + }, + { + "epoch": 13.119103773584905, + "grad_norm": 5.086663722991943, + "learning_rate": 3.1930861060852485e-06, + "loss": 0.2862, + "num_input_tokens_seen": 14531080, + "step": 22250 + }, + { + "epoch": 13.122051886792454, + "grad_norm": 2.7891407012939453, + "learning_rate": 3.1906875015987194e-06, + "loss": 0.3712, + "num_input_tokens_seen": 14534504, + "step": 22255 + }, + { + "epoch": 13.125, + "grad_norm": 5.797956943511963, + "learning_rate": 3.188289376135473e-06, + "loss": 0.304, + "num_input_tokens_seen": 14538056, + "step": 22260 + }, + { + "epoch": 13.127948113207546, + "grad_norm": 4.2157158851623535, + "learning_rate": 3.1858917303304213e-06, + "loss": 0.3575, + "num_input_tokens_seen": 14540968, + "step": 22265 + }, + { + "epoch": 13.130896226415095, + "grad_norm": 3.279102325439453, + "learning_rate": 3.1834945648183535e-06, + "loss": 0.2763, + "num_input_tokens_seen": 14544520, + "step": 22270 + }, + { + "epoch": 13.133844339622641, + "grad_norm": 6.092158317565918, + "learning_rate": 3.1810978802339283e-06, + "loss": 0.3363, + "num_input_tokens_seen": 14547176, + "step": 22275 + }, + { + "epoch": 13.13679245283019, + "grad_norm": 3.129951000213623, + "learning_rate": 3.1787016772116767e-06, + "loss": 0.3015, + "num_input_tokens_seen": 14552840, + "step": 22280 + }, + { + "epoch": 13.139740566037736, + "grad_norm": 3.3574717044830322, + "learning_rate": 3.1763059563860073e-06, + "loss": 0.3367, + "num_input_tokens_seen": 14555752, + "step": 22285 + }, + { + "epoch": 13.142688679245284, + "grad_norm": 3.4803028106689453, + "learning_rate": 3.1739107183911953e-06, + "loss": 0.4115, + "num_input_tokens_seen": 14558952, + "step": 22290 + }, + { + "epoch": 13.14563679245283, + "grad_norm": 2.781121253967285, + "learning_rate": 3.1715159638613898e-06, + "loss": 0.382, + "num_input_tokens_seen": 14563560, + "step": 22295 + }, + { + "epoch": 13.148584905660377, + "grad_norm": 6.066686153411865, + "learning_rate": 3.1691216934306134e-06, + "loss": 0.3265, + "num_input_tokens_seen": 14566984, + "step": 22300 + }, + { + "epoch": 13.151533018867925, + "grad_norm": 4.6800665855407715, + "learning_rate": 3.16672790773276e-06, + "loss": 0.2904, + "num_input_tokens_seen": 14569992, + "step": 22305 + }, + { + "epoch": 13.154481132075471, + "grad_norm": 2.685800075531006, + "learning_rate": 3.164334607401593e-06, + "loss": 0.2653, + "num_input_tokens_seen": 14573544, + "step": 22310 + }, + { + "epoch": 13.15742924528302, + "grad_norm": 2.884449005126953, + "learning_rate": 3.1619417930707506e-06, + "loss": 0.3992, + "num_input_tokens_seen": 14576584, + "step": 22315 + }, + { + "epoch": 13.160377358490566, + "grad_norm": 3.444021463394165, + "learning_rate": 3.1595494653737408e-06, + "loss": 0.3906, + "num_input_tokens_seen": 14579624, + "step": 22320 + }, + { + "epoch": 13.163325471698114, + "grad_norm": 4.368343830108643, + "learning_rate": 3.1571576249439408e-06, + "loss": 0.4239, + "num_input_tokens_seen": 14583304, + "step": 22325 + }, + { + "epoch": 13.16627358490566, + "grad_norm": 3.0407683849334717, + "learning_rate": 3.1547662724146e-06, + "loss": 0.2929, + "num_input_tokens_seen": 14586120, + "step": 22330 + }, + { + "epoch": 13.169221698113208, + "grad_norm": 5.003282070159912, + "learning_rate": 3.1523754084188436e-06, + "loss": 0.2204, + "num_input_tokens_seen": 14589544, + "step": 22335 + }, + { + "epoch": 13.172169811320755, + "grad_norm": 2.7133677005767822, + "learning_rate": 3.149985033589661e-06, + "loss": 0.3502, + "num_input_tokens_seen": 14593576, + "step": 22340 + }, + { + "epoch": 13.175117924528301, + "grad_norm": 2.7527003288269043, + "learning_rate": 3.147595148559912e-06, + "loss": 0.4278, + "num_input_tokens_seen": 14595976, + "step": 22345 + }, + { + "epoch": 13.17806603773585, + "grad_norm": 2.8890938758850098, + "learning_rate": 3.1452057539623328e-06, + "loss": 0.2835, + "num_input_tokens_seen": 14600200, + "step": 22350 + }, + { + "epoch": 13.181014150943396, + "grad_norm": 3.151230812072754, + "learning_rate": 3.142816850429523e-06, + "loss": 0.342, + "num_input_tokens_seen": 14603400, + "step": 22355 + }, + { + "epoch": 13.183962264150944, + "grad_norm": 4.360023498535156, + "learning_rate": 3.1404284385939552e-06, + "loss": 0.3212, + "num_input_tokens_seen": 14606280, + "step": 22360 + }, + { + "epoch": 13.18691037735849, + "grad_norm": 2.9618966579437256, + "learning_rate": 3.138040519087975e-06, + "loss": 0.364, + "num_input_tokens_seen": 14609224, + "step": 22365 + }, + { + "epoch": 13.189858490566039, + "grad_norm": 3.9195735454559326, + "learning_rate": 3.13565309254379e-06, + "loss": 0.3287, + "num_input_tokens_seen": 14612008, + "step": 22370 + }, + { + "epoch": 13.192806603773585, + "grad_norm": 3.0627336502075195, + "learning_rate": 3.1332661595934845e-06, + "loss": 0.3505, + "num_input_tokens_seen": 14614888, + "step": 22375 + }, + { + "epoch": 13.195754716981131, + "grad_norm": 2.8627541065216064, + "learning_rate": 3.130879720869008e-06, + "loss": 0.4883, + "num_input_tokens_seen": 14617960, + "step": 22380 + }, + { + "epoch": 13.19870283018868, + "grad_norm": 2.579274892807007, + "learning_rate": 3.1284937770021815e-06, + "loss": 0.2341, + "num_input_tokens_seen": 14620776, + "step": 22385 + }, + { + "epoch": 13.201650943396226, + "grad_norm": 3.149773597717285, + "learning_rate": 3.1261083286246916e-06, + "loss": 0.3882, + "num_input_tokens_seen": 14624392, + "step": 22390 + }, + { + "epoch": 13.204599056603774, + "grad_norm": 4.622279644012451, + "learning_rate": 3.1237233763680997e-06, + "loss": 0.2625, + "num_input_tokens_seen": 14628136, + "step": 22395 + }, + { + "epoch": 13.20754716981132, + "grad_norm": 2.9745142459869385, + "learning_rate": 3.1213389208638303e-06, + "loss": 0.2881, + "num_input_tokens_seen": 14632264, + "step": 22400 + }, + { + "epoch": 13.210495283018869, + "grad_norm": 4.559311866760254, + "learning_rate": 3.1189549627431757e-06, + "loss": 0.3104, + "num_input_tokens_seen": 14634952, + "step": 22405 + }, + { + "epoch": 13.213443396226415, + "grad_norm": 2.2010486125946045, + "learning_rate": 3.116571502637304e-06, + "loss": 0.1907, + "num_input_tokens_seen": 14637864, + "step": 22410 + }, + { + "epoch": 13.216391509433961, + "grad_norm": 4.6809515953063965, + "learning_rate": 3.1141885411772434e-06, + "loss": 0.3162, + "num_input_tokens_seen": 14640296, + "step": 22415 + }, + { + "epoch": 13.21933962264151, + "grad_norm": 3.5405428409576416, + "learning_rate": 3.111806078993893e-06, + "loss": 0.2773, + "num_input_tokens_seen": 14643688, + "step": 22420 + }, + { + "epoch": 13.222287735849056, + "grad_norm": 2.863039493560791, + "learning_rate": 3.1094241167180223e-06, + "loss": 0.4029, + "num_input_tokens_seen": 14646280, + "step": 22425 + }, + { + "epoch": 13.225235849056604, + "grad_norm": 2.4853432178497314, + "learning_rate": 3.1070426549802623e-06, + "loss": 0.3263, + "num_input_tokens_seen": 14650184, + "step": 22430 + }, + { + "epoch": 13.22818396226415, + "grad_norm": 4.08460807800293, + "learning_rate": 3.1046616944111196e-06, + "loss": 0.3968, + "num_input_tokens_seen": 14654152, + "step": 22435 + }, + { + "epoch": 13.231132075471699, + "grad_norm": 5.702762603759766, + "learning_rate": 3.1022812356409606e-06, + "loss": 0.3581, + "num_input_tokens_seen": 14656712, + "step": 22440 + }, + { + "epoch": 13.234080188679245, + "grad_norm": 2.6994597911834717, + "learning_rate": 3.0999012793000244e-06, + "loss": 0.3072, + "num_input_tokens_seen": 14659880, + "step": 22445 + }, + { + "epoch": 13.237028301886792, + "grad_norm": 2.3515756130218506, + "learning_rate": 3.097521826018414e-06, + "loss": 0.2346, + "num_input_tokens_seen": 14663752, + "step": 22450 + }, + { + "epoch": 13.23997641509434, + "grad_norm": 3.334465503692627, + "learning_rate": 3.0951428764260973e-06, + "loss": 0.4994, + "num_input_tokens_seen": 14666984, + "step": 22455 + }, + { + "epoch": 13.242924528301886, + "grad_norm": 3.8413703441619873, + "learning_rate": 3.092764431152915e-06, + "loss": 0.3028, + "num_input_tokens_seen": 14669896, + "step": 22460 + }, + { + "epoch": 13.245872641509434, + "grad_norm": 2.1272010803222656, + "learning_rate": 3.0903864908285693e-06, + "loss": 0.2516, + "num_input_tokens_seen": 14673704, + "step": 22465 + }, + { + "epoch": 13.24882075471698, + "grad_norm": 4.125630855560303, + "learning_rate": 3.088009056082629e-06, + "loss": 0.3128, + "num_input_tokens_seen": 14676680, + "step": 22470 + }, + { + "epoch": 13.251768867924529, + "grad_norm": 5.0732550621032715, + "learning_rate": 3.0856321275445324e-06, + "loss": 0.3558, + "num_input_tokens_seen": 14679656, + "step": 22475 + }, + { + "epoch": 13.254716981132075, + "grad_norm": 13.07447338104248, + "learning_rate": 3.0832557058435808e-06, + "loss": 0.4397, + "num_input_tokens_seen": 14681736, + "step": 22480 + }, + { + "epoch": 13.257665094339623, + "grad_norm": 5.427025318145752, + "learning_rate": 3.0808797916089405e-06, + "loss": 0.3563, + "num_input_tokens_seen": 14685256, + "step": 22485 + }, + { + "epoch": 13.26061320754717, + "grad_norm": 3.1773502826690674, + "learning_rate": 3.078504385469647e-06, + "loss": 0.3046, + "num_input_tokens_seen": 14688776, + "step": 22490 + }, + { + "epoch": 13.263561320754716, + "grad_norm": 3.1019439697265625, + "learning_rate": 3.076129488054599e-06, + "loss": 0.3171, + "num_input_tokens_seen": 14691720, + "step": 22495 + }, + { + "epoch": 13.266509433962264, + "grad_norm": 4.894453048706055, + "learning_rate": 3.0737550999925604e-06, + "loss": 0.3176, + "num_input_tokens_seen": 14694376, + "step": 22500 + }, + { + "epoch": 13.26945754716981, + "grad_norm": 4.793021202087402, + "learning_rate": 3.0713812219121604e-06, + "loss": 0.3899, + "num_input_tokens_seen": 14698376, + "step": 22505 + }, + { + "epoch": 13.272405660377359, + "grad_norm": 4.297952651977539, + "learning_rate": 3.0690078544418934e-06, + "loss": 0.3128, + "num_input_tokens_seen": 14701704, + "step": 22510 + }, + { + "epoch": 13.275353773584905, + "grad_norm": 5.058052062988281, + "learning_rate": 3.0666349982101198e-06, + "loss": 0.4028, + "num_input_tokens_seen": 14704936, + "step": 22515 + }, + { + "epoch": 13.278301886792454, + "grad_norm": 2.7775187492370605, + "learning_rate": 3.0642626538450627e-06, + "loss": 0.3548, + "num_input_tokens_seen": 14707976, + "step": 22520 + }, + { + "epoch": 13.28125, + "grad_norm": 6.442705154418945, + "learning_rate": 3.061890821974809e-06, + "loss": 0.4021, + "num_input_tokens_seen": 14710728, + "step": 22525 + }, + { + "epoch": 13.284198113207546, + "grad_norm": 4.321109771728516, + "learning_rate": 3.059519503227313e-06, + "loss": 0.361, + "num_input_tokens_seen": 14714472, + "step": 22530 + }, + { + "epoch": 13.287146226415095, + "grad_norm": 8.554970741271973, + "learning_rate": 3.057148698230393e-06, + "loss": 0.2191, + "num_input_tokens_seen": 14716840, + "step": 22535 + }, + { + "epoch": 13.290094339622641, + "grad_norm": 3.132094383239746, + "learning_rate": 3.0547784076117294e-06, + "loss": 0.3205, + "num_input_tokens_seen": 14719336, + "step": 22540 + }, + { + "epoch": 13.29304245283019, + "grad_norm": 5.446914196014404, + "learning_rate": 3.0524086319988635e-06, + "loss": 0.3142, + "num_input_tokens_seen": 14721896, + "step": 22545 + }, + { + "epoch": 13.295990566037736, + "grad_norm": 3.332773208618164, + "learning_rate": 3.0500393720192074e-06, + "loss": 0.2795, + "num_input_tokens_seen": 14725800, + "step": 22550 + }, + { + "epoch": 13.298938679245284, + "grad_norm": 8.36528205871582, + "learning_rate": 3.047670628300031e-06, + "loss": 0.4061, + "num_input_tokens_seen": 14729480, + "step": 22555 + }, + { + "epoch": 13.30188679245283, + "grad_norm": 2.5486607551574707, + "learning_rate": 3.0453024014684694e-06, + "loss": 0.2783, + "num_input_tokens_seen": 14732680, + "step": 22560 + }, + { + "epoch": 13.304834905660377, + "grad_norm": 5.933586597442627, + "learning_rate": 3.0429346921515225e-06, + "loss": 0.3391, + "num_input_tokens_seen": 14736488, + "step": 22565 + }, + { + "epoch": 13.307783018867925, + "grad_norm": 7.017425537109375, + "learning_rate": 3.04056750097605e-06, + "loss": 0.3137, + "num_input_tokens_seen": 14739976, + "step": 22570 + }, + { + "epoch": 13.310731132075471, + "grad_norm": 2.932344436645508, + "learning_rate": 3.0382008285687754e-06, + "loss": 0.2779, + "num_input_tokens_seen": 14742792, + "step": 22575 + }, + { + "epoch": 13.31367924528302, + "grad_norm": 2.562885046005249, + "learning_rate": 3.035834675556287e-06, + "loss": 0.325, + "num_input_tokens_seen": 14746824, + "step": 22580 + }, + { + "epoch": 13.316627358490566, + "grad_norm": 3.401585817337036, + "learning_rate": 3.0334690425650336e-06, + "loss": 0.2529, + "num_input_tokens_seen": 14750408, + "step": 22585 + }, + { + "epoch": 13.319575471698114, + "grad_norm": 2.9116499423980713, + "learning_rate": 3.031103930221325e-06, + "loss": 0.3458, + "num_input_tokens_seen": 14753352, + "step": 22590 + }, + { + "epoch": 13.32252358490566, + "grad_norm": 3.032701015472412, + "learning_rate": 3.028739339151338e-06, + "loss": 0.2599, + "num_input_tokens_seen": 14756264, + "step": 22595 + }, + { + "epoch": 13.325471698113208, + "grad_norm": 2.722766876220703, + "learning_rate": 3.0263752699811067e-06, + "loss": 0.3395, + "num_input_tokens_seen": 14759048, + "step": 22600 + }, + { + "epoch": 13.328419811320755, + "grad_norm": 3.229226589202881, + "learning_rate": 3.0240117233365267e-06, + "loss": 0.2788, + "num_input_tokens_seen": 14762248, + "step": 22605 + }, + { + "epoch": 13.331367924528301, + "grad_norm": 2.351168155670166, + "learning_rate": 3.0216486998433604e-06, + "loss": 0.2963, + "num_input_tokens_seen": 14765544, + "step": 22610 + }, + { + "epoch": 13.33431603773585, + "grad_norm": 2.5547897815704346, + "learning_rate": 3.0192862001272273e-06, + "loss": 0.2308, + "num_input_tokens_seen": 14768584, + "step": 22615 + }, + { + "epoch": 13.337264150943396, + "grad_norm": 3.550908088684082, + "learning_rate": 3.0169242248136066e-06, + "loss": 0.3515, + "num_input_tokens_seen": 14771496, + "step": 22620 + }, + { + "epoch": 13.340212264150944, + "grad_norm": 3.6083719730377197, + "learning_rate": 3.0145627745278457e-06, + "loss": 0.3583, + "num_input_tokens_seen": 14775016, + "step": 22625 + }, + { + "epoch": 13.34316037735849, + "grad_norm": 1.5629523992538452, + "learning_rate": 3.0122018498951478e-06, + "loss": 0.4992, + "num_input_tokens_seen": 14781096, + "step": 22630 + }, + { + "epoch": 13.346108490566039, + "grad_norm": 7.021904468536377, + "learning_rate": 3.0098414515405765e-06, + "loss": 0.307, + "num_input_tokens_seen": 14783528, + "step": 22635 + }, + { + "epoch": 13.349056603773585, + "grad_norm": 3.5611305236816406, + "learning_rate": 3.0074815800890576e-06, + "loss": 0.3143, + "num_input_tokens_seen": 14786728, + "step": 22640 + }, + { + "epoch": 13.352004716981131, + "grad_norm": 4.927103042602539, + "learning_rate": 3.005122236165378e-06, + "loss": 0.3502, + "num_input_tokens_seen": 14789480, + "step": 22645 + }, + { + "epoch": 13.35495283018868, + "grad_norm": 2.151087522506714, + "learning_rate": 3.0027634203941847e-06, + "loss": 0.2423, + "num_input_tokens_seen": 14792488, + "step": 22650 + }, + { + "epoch": 13.357900943396226, + "grad_norm": 6.692821502685547, + "learning_rate": 3.0004051333999816e-06, + "loss": 0.3403, + "num_input_tokens_seen": 14795368, + "step": 22655 + }, + { + "epoch": 13.360849056603774, + "grad_norm": 5.832033634185791, + "learning_rate": 2.998047375807139e-06, + "loss": 0.3229, + "num_input_tokens_seen": 14799112, + "step": 22660 + }, + { + "epoch": 13.36379716981132, + "grad_norm": 7.336167812347412, + "learning_rate": 2.995690148239881e-06, + "loss": 0.2957, + "num_input_tokens_seen": 14802920, + "step": 22665 + }, + { + "epoch": 13.366745283018869, + "grad_norm": 2.8972487449645996, + "learning_rate": 2.993333451322293e-06, + "loss": 0.2759, + "num_input_tokens_seen": 14805704, + "step": 22670 + }, + { + "epoch": 13.369693396226415, + "grad_norm": 2.6393609046936035, + "learning_rate": 2.9909772856783242e-06, + "loss": 0.3963, + "num_input_tokens_seen": 14808584, + "step": 22675 + }, + { + "epoch": 13.372641509433961, + "grad_norm": 2.305396556854248, + "learning_rate": 2.988621651931777e-06, + "loss": 0.2733, + "num_input_tokens_seen": 14813512, + "step": 22680 + }, + { + "epoch": 13.37558962264151, + "grad_norm": 2.9063432216644287, + "learning_rate": 2.986266550706315e-06, + "loss": 0.3186, + "num_input_tokens_seen": 14817096, + "step": 22685 + }, + { + "epoch": 13.378537735849056, + "grad_norm": 11.045530319213867, + "learning_rate": 2.9839119826254627e-06, + "loss": 0.3839, + "num_input_tokens_seen": 14819688, + "step": 22690 + }, + { + "epoch": 13.381485849056604, + "grad_norm": 5.467602252960205, + "learning_rate": 2.981557948312602e-06, + "loss": 0.4517, + "num_input_tokens_seen": 14822312, + "step": 22695 + }, + { + "epoch": 13.38443396226415, + "grad_norm": 4.135236740112305, + "learning_rate": 2.9792044483909733e-06, + "loss": 0.3721, + "num_input_tokens_seen": 14824744, + "step": 22700 + }, + { + "epoch": 13.387382075471699, + "grad_norm": 3.174801826477051, + "learning_rate": 2.9768514834836767e-06, + "loss": 0.4582, + "num_input_tokens_seen": 14828200, + "step": 22705 + }, + { + "epoch": 13.390330188679245, + "grad_norm": 3.8671891689300537, + "learning_rate": 2.9744990542136685e-06, + "loss": 0.3289, + "num_input_tokens_seen": 14831304, + "step": 22710 + }, + { + "epoch": 13.393278301886792, + "grad_norm": 2.4437100887298584, + "learning_rate": 2.9721471612037637e-06, + "loss": 0.4045, + "num_input_tokens_seen": 14835752, + "step": 22715 + }, + { + "epoch": 13.39622641509434, + "grad_norm": 2.2491302490234375, + "learning_rate": 2.9697958050766385e-06, + "loss": 0.3673, + "num_input_tokens_seen": 14838920, + "step": 22720 + }, + { + "epoch": 13.399174528301886, + "grad_norm": 5.602293014526367, + "learning_rate": 2.967444986454825e-06, + "loss": 0.3679, + "num_input_tokens_seen": 14842600, + "step": 22725 + }, + { + "epoch": 13.402122641509434, + "grad_norm": 3.9482178688049316, + "learning_rate": 2.9650947059607106e-06, + "loss": 0.2756, + "num_input_tokens_seen": 14845416, + "step": 22730 + }, + { + "epoch": 13.40507075471698, + "grad_norm": 5.400810718536377, + "learning_rate": 2.962744964216542e-06, + "loss": 0.3059, + "num_input_tokens_seen": 14847848, + "step": 22735 + }, + { + "epoch": 13.408018867924529, + "grad_norm": 2.8984901905059814, + "learning_rate": 2.960395761844425e-06, + "loss": 0.3736, + "num_input_tokens_seen": 14850856, + "step": 22740 + }, + { + "epoch": 13.410966981132075, + "grad_norm": 4.835386276245117, + "learning_rate": 2.95804709946632e-06, + "loss": 0.4219, + "num_input_tokens_seen": 14853704, + "step": 22745 + }, + { + "epoch": 13.413915094339623, + "grad_norm": 2.659588575363159, + "learning_rate": 2.9556989777040457e-06, + "loss": 0.233, + "num_input_tokens_seen": 14856872, + "step": 22750 + }, + { + "epoch": 13.41686320754717, + "grad_norm": 5.155787944793701, + "learning_rate": 2.9533513971792776e-06, + "loss": 0.4429, + "num_input_tokens_seen": 14859304, + "step": 22755 + }, + { + "epoch": 13.419811320754716, + "grad_norm": 2.9190332889556885, + "learning_rate": 2.9510043585135473e-06, + "loss": 0.2641, + "num_input_tokens_seen": 14862920, + "step": 22760 + }, + { + "epoch": 13.422759433962264, + "grad_norm": 2.286158561706543, + "learning_rate": 2.948657862328244e-06, + "loss": 0.3865, + "num_input_tokens_seen": 14867176, + "step": 22765 + }, + { + "epoch": 13.42570754716981, + "grad_norm": 4.415739059448242, + "learning_rate": 2.946311909244613e-06, + "loss": 0.3741, + "num_input_tokens_seen": 14870568, + "step": 22770 + }, + { + "epoch": 13.428655660377359, + "grad_norm": 3.6429312229156494, + "learning_rate": 2.9439664998837538e-06, + "loss": 0.4991, + "num_input_tokens_seen": 14873384, + "step": 22775 + }, + { + "epoch": 13.431603773584905, + "grad_norm": 4.8949480056762695, + "learning_rate": 2.941621634866626e-06, + "loss": 0.3299, + "num_input_tokens_seen": 14876648, + "step": 22780 + }, + { + "epoch": 13.434551886792454, + "grad_norm": 2.4755232334136963, + "learning_rate": 2.9392773148140406e-06, + "loss": 0.3602, + "num_input_tokens_seen": 14879976, + "step": 22785 + }, + { + "epoch": 13.4375, + "grad_norm": 2.538907766342163, + "learning_rate": 2.9369335403466676e-06, + "loss": 0.293, + "num_input_tokens_seen": 14882632, + "step": 22790 + }, + { + "epoch": 13.440448113207546, + "grad_norm": 6.156617641448975, + "learning_rate": 2.9345903120850318e-06, + "loss": 0.5075, + "num_input_tokens_seen": 14885480, + "step": 22795 + }, + { + "epoch": 13.443396226415095, + "grad_norm": 2.482489585876465, + "learning_rate": 2.932247630649512e-06, + "loss": 0.3684, + "num_input_tokens_seen": 14888328, + "step": 22800 + }, + { + "epoch": 13.446344339622641, + "grad_norm": 3.7355949878692627, + "learning_rate": 2.9299054966603424e-06, + "loss": 0.3042, + "num_input_tokens_seen": 14891688, + "step": 22805 + }, + { + "epoch": 13.44929245283019, + "grad_norm": 1.738120675086975, + "learning_rate": 2.927563910737613e-06, + "loss": 0.29, + "num_input_tokens_seen": 14894504, + "step": 22810 + }, + { + "epoch": 13.452240566037736, + "grad_norm": 6.150461673736572, + "learning_rate": 2.9252228735012722e-06, + "loss": 0.2846, + "num_input_tokens_seen": 14897128, + "step": 22815 + }, + { + "epoch": 13.455188679245284, + "grad_norm": 3.227527379989624, + "learning_rate": 2.9228823855711174e-06, + "loss": 0.2392, + "num_input_tokens_seen": 14902088, + "step": 22820 + }, + { + "epoch": 13.45813679245283, + "grad_norm": 3.3321259021759033, + "learning_rate": 2.920542447566802e-06, + "loss": 0.3595, + "num_input_tokens_seen": 14905064, + "step": 22825 + }, + { + "epoch": 13.461084905660377, + "grad_norm": 4.2693305015563965, + "learning_rate": 2.918203060107837e-06, + "loss": 0.4227, + "num_input_tokens_seen": 14908456, + "step": 22830 + }, + { + "epoch": 13.464033018867925, + "grad_norm": 5.089859962463379, + "learning_rate": 2.9158642238135813e-06, + "loss": 0.3822, + "num_input_tokens_seen": 14912456, + "step": 22835 + }, + { + "epoch": 13.466981132075471, + "grad_norm": 5.6995015144348145, + "learning_rate": 2.913525939303257e-06, + "loss": 0.413, + "num_input_tokens_seen": 14915656, + "step": 22840 + }, + { + "epoch": 13.46992924528302, + "grad_norm": 4.336443901062012, + "learning_rate": 2.9111882071959317e-06, + "loss": 0.3186, + "num_input_tokens_seen": 14918792, + "step": 22845 + }, + { + "epoch": 13.472877358490566, + "grad_norm": 3.9792988300323486, + "learning_rate": 2.908851028110532e-06, + "loss": 0.3136, + "num_input_tokens_seen": 14923112, + "step": 22850 + }, + { + "epoch": 13.475825471698114, + "grad_norm": 5.973363399505615, + "learning_rate": 2.906514402665834e-06, + "loss": 0.2736, + "num_input_tokens_seen": 14926632, + "step": 22855 + }, + { + "epoch": 13.47877358490566, + "grad_norm": 2.341519355773926, + "learning_rate": 2.9041783314804705e-06, + "loss": 0.3132, + "num_input_tokens_seen": 14929448, + "step": 22860 + }, + { + "epoch": 13.481721698113208, + "grad_norm": 3.053133249282837, + "learning_rate": 2.9018428151729238e-06, + "loss": 0.3507, + "num_input_tokens_seen": 14931752, + "step": 22865 + }, + { + "epoch": 13.484669811320755, + "grad_norm": 3.11784029006958, + "learning_rate": 2.899507854361537e-06, + "loss": 0.2298, + "num_input_tokens_seen": 14934376, + "step": 22870 + }, + { + "epoch": 13.487617924528301, + "grad_norm": 3.128655195236206, + "learning_rate": 2.8971734496644975e-06, + "loss": 0.3315, + "num_input_tokens_seen": 14937064, + "step": 22875 + }, + { + "epoch": 13.49056603773585, + "grad_norm": 3.965510368347168, + "learning_rate": 2.894839601699851e-06, + "loss": 0.4637, + "num_input_tokens_seen": 14940616, + "step": 22880 + }, + { + "epoch": 13.493514150943396, + "grad_norm": 2.761537551879883, + "learning_rate": 2.8925063110854923e-06, + "loss": 0.361, + "num_input_tokens_seen": 14944968, + "step": 22885 + }, + { + "epoch": 13.496462264150944, + "grad_norm": 5.435567855834961, + "learning_rate": 2.8901735784391683e-06, + "loss": 0.4699, + "num_input_tokens_seen": 14948488, + "step": 22890 + }, + { + "epoch": 13.49941037735849, + "grad_norm": 3.0014123916625977, + "learning_rate": 2.8878414043784844e-06, + "loss": 0.4044, + "num_input_tokens_seen": 14951176, + "step": 22895 + }, + { + "epoch": 13.502358490566039, + "grad_norm": 3.4619250297546387, + "learning_rate": 2.885509789520891e-06, + "loss": 0.2662, + "num_input_tokens_seen": 14953448, + "step": 22900 + }, + { + "epoch": 13.505306603773585, + "grad_norm": 4.056725025177002, + "learning_rate": 2.8831787344836926e-06, + "loss": 0.3374, + "num_input_tokens_seen": 14956232, + "step": 22905 + }, + { + "epoch": 13.508254716981131, + "grad_norm": 2.493910789489746, + "learning_rate": 2.880848239884049e-06, + "loss": 0.3452, + "num_input_tokens_seen": 14960072, + "step": 22910 + }, + { + "epoch": 13.51120283018868, + "grad_norm": 1.4840056896209717, + "learning_rate": 2.8785183063389667e-06, + "loss": 0.2982, + "num_input_tokens_seen": 14963944, + "step": 22915 + }, + { + "epoch": 13.514150943396226, + "grad_norm": 5.421234130859375, + "learning_rate": 2.876188934465306e-06, + "loss": 0.3902, + "num_input_tokens_seen": 14966888, + "step": 22920 + }, + { + "epoch": 13.517099056603774, + "grad_norm": 5.5362548828125, + "learning_rate": 2.8738601248797758e-06, + "loss": 0.3042, + "num_input_tokens_seen": 14969992, + "step": 22925 + }, + { + "epoch": 13.52004716981132, + "grad_norm": 7.7347941398620605, + "learning_rate": 2.8715318781989432e-06, + "loss": 0.4694, + "num_input_tokens_seen": 14972808, + "step": 22930 + }, + { + "epoch": 13.522995283018869, + "grad_norm": 1.5919901132583618, + "learning_rate": 2.869204195039219e-06, + "loss": 0.2504, + "num_input_tokens_seen": 14976264, + "step": 22935 + }, + { + "epoch": 13.525943396226415, + "grad_norm": 3.2556636333465576, + "learning_rate": 2.8668770760168673e-06, + "loss": 0.3181, + "num_input_tokens_seen": 14979912, + "step": 22940 + }, + { + "epoch": 13.528891509433961, + "grad_norm": 3.1158010959625244, + "learning_rate": 2.864550521748003e-06, + "loss": 0.343, + "num_input_tokens_seen": 14983176, + "step": 22945 + }, + { + "epoch": 13.53183962264151, + "grad_norm": 1.6123318672180176, + "learning_rate": 2.862224532848591e-06, + "loss": 0.2512, + "num_input_tokens_seen": 14986184, + "step": 22950 + }, + { + "epoch": 13.534787735849056, + "grad_norm": 3.5667927265167236, + "learning_rate": 2.8598991099344455e-06, + "loss": 0.3083, + "num_input_tokens_seen": 14989832, + "step": 22955 + }, + { + "epoch": 13.537735849056604, + "grad_norm": 4.33881139755249, + "learning_rate": 2.857574253621236e-06, + "loss": 0.4403, + "num_input_tokens_seen": 14992616, + "step": 22960 + }, + { + "epoch": 13.54068396226415, + "grad_norm": 2.4982235431671143, + "learning_rate": 2.855249964524476e-06, + "loss": 0.4873, + "num_input_tokens_seen": 14996424, + "step": 22965 + }, + { + "epoch": 13.543632075471699, + "grad_norm": 5.1585469245910645, + "learning_rate": 2.852926243259531e-06, + "loss": 0.3503, + "num_input_tokens_seen": 14999400, + "step": 22970 + }, + { + "epoch": 13.546580188679245, + "grad_norm": 2.7232210636138916, + "learning_rate": 2.850603090441617e-06, + "loss": 0.3814, + "num_input_tokens_seen": 15001928, + "step": 22975 + }, + { + "epoch": 13.549528301886792, + "grad_norm": 5.4202094078063965, + "learning_rate": 2.848280506685798e-06, + "loss": 0.2914, + "num_input_tokens_seen": 15005256, + "step": 22980 + }, + { + "epoch": 13.55247641509434, + "grad_norm": 3.735093355178833, + "learning_rate": 2.845958492606986e-06, + "loss": 0.3507, + "num_input_tokens_seen": 15009256, + "step": 22985 + }, + { + "epoch": 13.555424528301886, + "grad_norm": 2.696908950805664, + "learning_rate": 2.843637048819949e-06, + "loss": 0.2759, + "num_input_tokens_seen": 15012104, + "step": 22990 + }, + { + "epoch": 13.558372641509434, + "grad_norm": 3.6791319847106934, + "learning_rate": 2.8413161759392966e-06, + "loss": 0.4352, + "num_input_tokens_seen": 15015176, + "step": 22995 + }, + { + "epoch": 13.56132075471698, + "grad_norm": 2.4970524311065674, + "learning_rate": 2.8389958745794878e-06, + "loss": 0.3515, + "num_input_tokens_seen": 15018920, + "step": 23000 + }, + { + "epoch": 13.564268867924529, + "grad_norm": 2.128286123275757, + "learning_rate": 2.8366761453548366e-06, + "loss": 0.3874, + "num_input_tokens_seen": 15022536, + "step": 23005 + }, + { + "epoch": 13.567216981132075, + "grad_norm": 3.6343510150909424, + "learning_rate": 2.8343569888795e-06, + "loss": 0.3192, + "num_input_tokens_seen": 15025576, + "step": 23010 + }, + { + "epoch": 13.570165094339622, + "grad_norm": 4.324986934661865, + "learning_rate": 2.832038405767483e-06, + "loss": 0.4975, + "num_input_tokens_seen": 15032936, + "step": 23015 + }, + { + "epoch": 13.57311320754717, + "grad_norm": 10.925804138183594, + "learning_rate": 2.8297203966326397e-06, + "loss": 0.4222, + "num_input_tokens_seen": 15035336, + "step": 23020 + }, + { + "epoch": 13.576061320754716, + "grad_norm": 5.401839256286621, + "learning_rate": 2.8274029620886773e-06, + "loss": 0.3654, + "num_input_tokens_seen": 15038984, + "step": 23025 + }, + { + "epoch": 13.579009433962264, + "grad_norm": 2.814840316772461, + "learning_rate": 2.825086102749144e-06, + "loss": 0.3568, + "num_input_tokens_seen": 15042696, + "step": 23030 + }, + { + "epoch": 13.58195754716981, + "grad_norm": 2.7198328971862793, + "learning_rate": 2.822769819227438e-06, + "loss": 0.3471, + "num_input_tokens_seen": 15045512, + "step": 23035 + }, + { + "epoch": 13.584905660377359, + "grad_norm": 5.235401153564453, + "learning_rate": 2.8204541121368055e-06, + "loss": 0.3416, + "num_input_tokens_seen": 15048520, + "step": 23040 + }, + { + "epoch": 13.587853773584905, + "grad_norm": 2.7315614223480225, + "learning_rate": 2.8181389820903402e-06, + "loss": 0.2729, + "num_input_tokens_seen": 15051720, + "step": 23045 + }, + { + "epoch": 13.590801886792454, + "grad_norm": 4.099570274353027, + "learning_rate": 2.8158244297009814e-06, + "loss": 0.3268, + "num_input_tokens_seen": 15055304, + "step": 23050 + }, + { + "epoch": 13.59375, + "grad_norm": 3.6875975131988525, + "learning_rate": 2.8135104555815196e-06, + "loss": 0.4602, + "num_input_tokens_seen": 15058184, + "step": 23055 + }, + { + "epoch": 13.596698113207546, + "grad_norm": 2.273271083831787, + "learning_rate": 2.811197060344588e-06, + "loss": 0.2435, + "num_input_tokens_seen": 15061288, + "step": 23060 + }, + { + "epoch": 13.599646226415095, + "grad_norm": 2.665348768234253, + "learning_rate": 2.8088842446026677e-06, + "loss": 0.2613, + "num_input_tokens_seen": 15064488, + "step": 23065 + }, + { + "epoch": 13.602594339622641, + "grad_norm": 7.053226470947266, + "learning_rate": 2.806572008968087e-06, + "loss": 0.2765, + "num_input_tokens_seen": 15068680, + "step": 23070 + }, + { + "epoch": 13.60554245283019, + "grad_norm": 2.7762746810913086, + "learning_rate": 2.80426035405302e-06, + "loss": 0.2525, + "num_input_tokens_seen": 15071784, + "step": 23075 + }, + { + "epoch": 13.608490566037736, + "grad_norm": 2.9505319595336914, + "learning_rate": 2.8019492804694852e-06, + "loss": 0.2494, + "num_input_tokens_seen": 15074824, + "step": 23080 + }, + { + "epoch": 13.611438679245284, + "grad_norm": 3.0616211891174316, + "learning_rate": 2.799638788829354e-06, + "loss": 0.304, + "num_input_tokens_seen": 15077768, + "step": 23085 + }, + { + "epoch": 13.61438679245283, + "grad_norm": 2.239469051361084, + "learning_rate": 2.7973288797443367e-06, + "loss": 0.2506, + "num_input_tokens_seen": 15080712, + "step": 23090 + }, + { + "epoch": 13.617334905660378, + "grad_norm": 2.850687026977539, + "learning_rate": 2.7950195538259884e-06, + "loss": 0.2947, + "num_input_tokens_seen": 15083560, + "step": 23095 + }, + { + "epoch": 13.620283018867925, + "grad_norm": 3.485285520553589, + "learning_rate": 2.792710811685719e-06, + "loss": 0.2852, + "num_input_tokens_seen": 15087240, + "step": 23100 + }, + { + "epoch": 13.623231132075471, + "grad_norm": 2.953608989715576, + "learning_rate": 2.7904026539347743e-06, + "loss": 0.2911, + "num_input_tokens_seen": 15090440, + "step": 23105 + }, + { + "epoch": 13.62617924528302, + "grad_norm": 3.5990209579467773, + "learning_rate": 2.7880950811842507e-06, + "loss": 0.4361, + "num_input_tokens_seen": 15093992, + "step": 23110 + }, + { + "epoch": 13.629127358490566, + "grad_norm": 5.496184349060059, + "learning_rate": 2.785788094045085e-06, + "loss": 0.4104, + "num_input_tokens_seen": 15096904, + "step": 23115 + }, + { + "epoch": 13.632075471698114, + "grad_norm": 3.197831630706787, + "learning_rate": 2.7834816931280655e-06, + "loss": 0.2685, + "num_input_tokens_seen": 15100168, + "step": 23120 + }, + { + "epoch": 13.63502358490566, + "grad_norm": 4.219309329986572, + "learning_rate": 2.781175879043821e-06, + "loss": 0.4806, + "num_input_tokens_seen": 15102728, + "step": 23125 + }, + { + "epoch": 13.637971698113208, + "grad_norm": 4.521852970123291, + "learning_rate": 2.778870652402825e-06, + "loss": 0.3423, + "num_input_tokens_seen": 15105544, + "step": 23130 + }, + { + "epoch": 13.640919811320755, + "grad_norm": 2.232077121734619, + "learning_rate": 2.776566013815396e-06, + "loss": 0.3124, + "num_input_tokens_seen": 15109096, + "step": 23135 + }, + { + "epoch": 13.643867924528301, + "grad_norm": 3.8576509952545166, + "learning_rate": 2.774261963891698e-06, + "loss": 0.3938, + "num_input_tokens_seen": 15112840, + "step": 23140 + }, + { + "epoch": 13.64681603773585, + "grad_norm": 3.055460214614868, + "learning_rate": 2.771958503241735e-06, + "loss": 0.3108, + "num_input_tokens_seen": 15116200, + "step": 23145 + }, + { + "epoch": 13.649764150943396, + "grad_norm": 3.8606085777282715, + "learning_rate": 2.769655632475362e-06, + "loss": 0.4542, + "num_input_tokens_seen": 15118728, + "step": 23150 + }, + { + "epoch": 13.652712264150944, + "grad_norm": 3.8641085624694824, + "learning_rate": 2.7673533522022733e-06, + "loss": 0.3012, + "num_input_tokens_seen": 15121896, + "step": 23155 + }, + { + "epoch": 13.65566037735849, + "grad_norm": 3.082353115081787, + "learning_rate": 2.765051663032007e-06, + "loss": 0.3006, + "num_input_tokens_seen": 15124232, + "step": 23160 + }, + { + "epoch": 13.658608490566039, + "grad_norm": 4.670108318328857, + "learning_rate": 2.7627505655739446e-06, + "loss": 0.249, + "num_input_tokens_seen": 15126984, + "step": 23165 + }, + { + "epoch": 13.661556603773585, + "grad_norm": 4.433457374572754, + "learning_rate": 2.7604500604373097e-06, + "loss": 0.391, + "num_input_tokens_seen": 15130664, + "step": 23170 + }, + { + "epoch": 13.664504716981131, + "grad_norm": 2.0076403617858887, + "learning_rate": 2.7581501482311757e-06, + "loss": 0.3589, + "num_input_tokens_seen": 15133320, + "step": 23175 + }, + { + "epoch": 13.66745283018868, + "grad_norm": 2.8566620349884033, + "learning_rate": 2.7558508295644513e-06, + "loss": 0.2966, + "num_input_tokens_seen": 15136552, + "step": 23180 + }, + { + "epoch": 13.670400943396226, + "grad_norm": 4.344300270080566, + "learning_rate": 2.7535521050458922e-06, + "loss": 0.2632, + "num_input_tokens_seen": 15143208, + "step": 23185 + }, + { + "epoch": 13.673349056603774, + "grad_norm": 3.4346835613250732, + "learning_rate": 2.7512539752840926e-06, + "loss": 0.2723, + "num_input_tokens_seen": 15147240, + "step": 23190 + }, + { + "epoch": 13.67629716981132, + "grad_norm": 3.64217209815979, + "learning_rate": 2.748956440887497e-06, + "loss": 0.4407, + "num_input_tokens_seen": 15149736, + "step": 23195 + }, + { + "epoch": 13.679245283018869, + "grad_norm": 6.360025882720947, + "learning_rate": 2.7466595024643843e-06, + "loss": 0.3704, + "num_input_tokens_seen": 15152872, + "step": 23200 + }, + { + "epoch": 13.682193396226415, + "grad_norm": 15.878799438476562, + "learning_rate": 2.744363160622878e-06, + "loss": 0.4685, + "num_input_tokens_seen": 15155752, + "step": 23205 + }, + { + "epoch": 13.685141509433961, + "grad_norm": 2.9222614765167236, + "learning_rate": 2.742067415970948e-06, + "loss": 0.3259, + "num_input_tokens_seen": 15158600, + "step": 23210 + }, + { + "epoch": 13.68808962264151, + "grad_norm": 4.539727210998535, + "learning_rate": 2.739772269116402e-06, + "loss": 0.3177, + "num_input_tokens_seen": 15161608, + "step": 23215 + }, + { + "epoch": 13.691037735849056, + "grad_norm": 2.921752452850342, + "learning_rate": 2.7374777206668874e-06, + "loss": 0.3805, + "num_input_tokens_seen": 15164232, + "step": 23220 + }, + { + "epoch": 13.693985849056604, + "grad_norm": 8.450603485107422, + "learning_rate": 2.735183771229898e-06, + "loss": 0.4193, + "num_input_tokens_seen": 15167240, + "step": 23225 + }, + { + "epoch": 13.69693396226415, + "grad_norm": 2.2667641639709473, + "learning_rate": 2.732890421412765e-06, + "loss": 0.2843, + "num_input_tokens_seen": 15170568, + "step": 23230 + }, + { + "epoch": 13.699882075471699, + "grad_norm": 2.742682695388794, + "learning_rate": 2.7305976718226624e-06, + "loss": 0.5135, + "num_input_tokens_seen": 15175144, + "step": 23235 + }, + { + "epoch": 13.702830188679245, + "grad_norm": 3.1275877952575684, + "learning_rate": 2.728305523066609e-06, + "loss": 0.3248, + "num_input_tokens_seen": 15178664, + "step": 23240 + }, + { + "epoch": 13.705778301886792, + "grad_norm": 4.941312789916992, + "learning_rate": 2.726013975751458e-06, + "loss": 0.2745, + "num_input_tokens_seen": 15181160, + "step": 23245 + }, + { + "epoch": 13.70872641509434, + "grad_norm": 2.426234722137451, + "learning_rate": 2.723723030483908e-06, + "loss": 0.3392, + "num_input_tokens_seen": 15184392, + "step": 23250 + }, + { + "epoch": 13.711674528301886, + "grad_norm": 3.2399232387542725, + "learning_rate": 2.7214326878704953e-06, + "loss": 0.3636, + "num_input_tokens_seen": 15188520, + "step": 23255 + }, + { + "epoch": 13.714622641509434, + "grad_norm": 3.921346426010132, + "learning_rate": 2.7191429485175993e-06, + "loss": 0.3898, + "num_input_tokens_seen": 15191016, + "step": 23260 + }, + { + "epoch": 13.71757075471698, + "grad_norm": 2.3347933292388916, + "learning_rate": 2.716853813031435e-06, + "loss": 0.2562, + "num_input_tokens_seen": 15194248, + "step": 23265 + }, + { + "epoch": 13.720518867924529, + "grad_norm": 2.8732569217681885, + "learning_rate": 2.714565282018066e-06, + "loss": 0.4399, + "num_input_tokens_seen": 15197288, + "step": 23270 + }, + { + "epoch": 13.723466981132075, + "grad_norm": 3.814807176589966, + "learning_rate": 2.7122773560833877e-06, + "loss": 0.2981, + "num_input_tokens_seen": 15200904, + "step": 23275 + }, + { + "epoch": 13.726415094339622, + "grad_norm": 3.655446767807007, + "learning_rate": 2.709990035833139e-06, + "loss": 0.2533, + "num_input_tokens_seen": 15204072, + "step": 23280 + }, + { + "epoch": 13.72936320754717, + "grad_norm": 2.7474193572998047, + "learning_rate": 2.707703321872896e-06, + "loss": 0.4192, + "num_input_tokens_seen": 15207144, + "step": 23285 + }, + { + "epoch": 13.732311320754716, + "grad_norm": 3.4528629779815674, + "learning_rate": 2.705417214808079e-06, + "loss": 0.3169, + "num_input_tokens_seen": 15209832, + "step": 23290 + }, + { + "epoch": 13.735259433962264, + "grad_norm": 2.9516336917877197, + "learning_rate": 2.703131715243945e-06, + "loss": 0.3322, + "num_input_tokens_seen": 15212648, + "step": 23295 + }, + { + "epoch": 13.73820754716981, + "grad_norm": 4.275572299957275, + "learning_rate": 2.7008468237855855e-06, + "loss": 0.278, + "num_input_tokens_seen": 15216168, + "step": 23300 + }, + { + "epoch": 13.741155660377359, + "grad_norm": 4.126182556152344, + "learning_rate": 2.69856254103794e-06, + "loss": 0.3287, + "num_input_tokens_seen": 15218888, + "step": 23305 + }, + { + "epoch": 13.744103773584905, + "grad_norm": 2.8749279975891113, + "learning_rate": 2.6962788676057806e-06, + "loss": 0.3739, + "num_input_tokens_seen": 15221672, + "step": 23310 + }, + { + "epoch": 13.747051886792454, + "grad_norm": 2.8239290714263916, + "learning_rate": 2.69399580409372e-06, + "loss": 0.3131, + "num_input_tokens_seen": 15225352, + "step": 23315 + }, + { + "epoch": 13.75, + "grad_norm": 2.651754379272461, + "learning_rate": 2.6917133511062076e-06, + "loss": 0.3032, + "num_input_tokens_seen": 15228264, + "step": 23320 + }, + { + "epoch": 13.752948113207546, + "grad_norm": 4.78381872177124, + "learning_rate": 2.6894315092475342e-06, + "loss": 0.4379, + "num_input_tokens_seen": 15231560, + "step": 23325 + }, + { + "epoch": 13.755896226415095, + "grad_norm": 4.6431684494018555, + "learning_rate": 2.6871502791218245e-06, + "loss": 0.2724, + "num_input_tokens_seen": 15237192, + "step": 23330 + }, + { + "epoch": 13.758844339622641, + "grad_norm": 3.291208028793335, + "learning_rate": 2.684869661333048e-06, + "loss": 0.4597, + "num_input_tokens_seen": 15240200, + "step": 23335 + }, + { + "epoch": 13.76179245283019, + "grad_norm": 3.223240852355957, + "learning_rate": 2.6825896564850074e-06, + "loss": 0.3341, + "num_input_tokens_seen": 15243528, + "step": 23340 + }, + { + "epoch": 13.764740566037736, + "grad_norm": 5.603666305541992, + "learning_rate": 2.6803102651813416e-06, + "loss": 0.3714, + "num_input_tokens_seen": 15246120, + "step": 23345 + }, + { + "epoch": 13.767688679245284, + "grad_norm": 2.7316973209381104, + "learning_rate": 2.6780314880255307e-06, + "loss": 0.2401, + "num_input_tokens_seen": 15249896, + "step": 23350 + }, + { + "epoch": 13.77063679245283, + "grad_norm": 3.015049934387207, + "learning_rate": 2.675753325620891e-06, + "loss": 0.3649, + "num_input_tokens_seen": 15253576, + "step": 23355 + }, + { + "epoch": 13.773584905660378, + "grad_norm": 3.319258451461792, + "learning_rate": 2.6734757785705727e-06, + "loss": 0.3845, + "num_input_tokens_seen": 15257320, + "step": 23360 + }, + { + "epoch": 13.776533018867925, + "grad_norm": 2.7273547649383545, + "learning_rate": 2.6711988474775712e-06, + "loss": 0.4991, + "num_input_tokens_seen": 15260872, + "step": 23365 + }, + { + "epoch": 13.779481132075471, + "grad_norm": 2.7415835857391357, + "learning_rate": 2.668922532944711e-06, + "loss": 0.2537, + "num_input_tokens_seen": 15263976, + "step": 23370 + }, + { + "epoch": 13.78242924528302, + "grad_norm": 3.642705202102661, + "learning_rate": 2.6666468355746566e-06, + "loss": 0.4049, + "num_input_tokens_seen": 15267336, + "step": 23375 + }, + { + "epoch": 13.785377358490566, + "grad_norm": 2.088280200958252, + "learning_rate": 2.6643717559699073e-06, + "loss": 0.3734, + "num_input_tokens_seen": 15270312, + "step": 23380 + }, + { + "epoch": 13.788325471698114, + "grad_norm": 3.0473570823669434, + "learning_rate": 2.662097294732803e-06, + "loss": 0.3298, + "num_input_tokens_seen": 15273192, + "step": 23385 + }, + { + "epoch": 13.79127358490566, + "grad_norm": 3.747938632965088, + "learning_rate": 2.6598234524655165e-06, + "loss": 0.3202, + "num_input_tokens_seen": 15275624, + "step": 23390 + }, + { + "epoch": 13.794221698113208, + "grad_norm": 2.1452693939208984, + "learning_rate": 2.657550229770054e-06, + "loss": 0.3269, + "num_input_tokens_seen": 15278760, + "step": 23395 + }, + { + "epoch": 13.797169811320755, + "grad_norm": 3.285217046737671, + "learning_rate": 2.655277627248265e-06, + "loss": 0.2838, + "num_input_tokens_seen": 15283080, + "step": 23400 + }, + { + "epoch": 13.800117924528301, + "grad_norm": 3.3791892528533936, + "learning_rate": 2.65300564550183e-06, + "loss": 0.3196, + "num_input_tokens_seen": 15285768, + "step": 23405 + }, + { + "epoch": 13.80306603773585, + "grad_norm": 4.2436089515686035, + "learning_rate": 2.6507342851322647e-06, + "loss": 0.3361, + "num_input_tokens_seen": 15288616, + "step": 23410 + }, + { + "epoch": 13.806014150943396, + "grad_norm": 4.9022932052612305, + "learning_rate": 2.6484635467409233e-06, + "loss": 0.3475, + "num_input_tokens_seen": 15292264, + "step": 23415 + }, + { + "epoch": 13.808962264150944, + "grad_norm": 3.644580841064453, + "learning_rate": 2.64619343092899e-06, + "loss": 0.3056, + "num_input_tokens_seen": 15294664, + "step": 23420 + }, + { + "epoch": 13.81191037735849, + "grad_norm": 4.194464683532715, + "learning_rate": 2.643923938297492e-06, + "loss": 0.3491, + "num_input_tokens_seen": 15297288, + "step": 23425 + }, + { + "epoch": 13.814858490566039, + "grad_norm": 4.232224941253662, + "learning_rate": 2.6416550694472855e-06, + "loss": 0.2772, + "num_input_tokens_seen": 15300328, + "step": 23430 + }, + { + "epoch": 13.817806603773585, + "grad_norm": 5.802033424377441, + "learning_rate": 2.639386824979063e-06, + "loss": 0.3236, + "num_input_tokens_seen": 15303720, + "step": 23435 + }, + { + "epoch": 13.820754716981131, + "grad_norm": 4.994040489196777, + "learning_rate": 2.6371192054933525e-06, + "loss": 0.3114, + "num_input_tokens_seen": 15306824, + "step": 23440 + }, + { + "epoch": 13.82370283018868, + "grad_norm": 4.669008731842041, + "learning_rate": 2.634852211590516e-06, + "loss": 0.3862, + "num_input_tokens_seen": 15310472, + "step": 23445 + }, + { + "epoch": 13.826650943396226, + "grad_norm": 3.0111196041107178, + "learning_rate": 2.6325858438707473e-06, + "loss": 0.3303, + "num_input_tokens_seen": 15313896, + "step": 23450 + }, + { + "epoch": 13.829599056603774, + "grad_norm": 2.1155176162719727, + "learning_rate": 2.630320102934082e-06, + "loss": 0.3356, + "num_input_tokens_seen": 15316712, + "step": 23455 + }, + { + "epoch": 13.83254716981132, + "grad_norm": 2.2951526641845703, + "learning_rate": 2.628054989380382e-06, + "loss": 0.2678, + "num_input_tokens_seen": 15320360, + "step": 23460 + }, + { + "epoch": 13.835495283018869, + "grad_norm": 4.330511093139648, + "learning_rate": 2.625790503809346e-06, + "loss": 0.4571, + "num_input_tokens_seen": 15322952, + "step": 23465 + }, + { + "epoch": 13.838443396226415, + "grad_norm": 2.4139559268951416, + "learning_rate": 2.6235266468205067e-06, + "loss": 0.3216, + "num_input_tokens_seen": 15327080, + "step": 23470 + }, + { + "epoch": 13.841391509433961, + "grad_norm": 5.090373992919922, + "learning_rate": 2.621263419013227e-06, + "loss": 0.4281, + "num_input_tokens_seen": 15329640, + "step": 23475 + }, + { + "epoch": 13.84433962264151, + "grad_norm": 5.41148567199707, + "learning_rate": 2.619000820986711e-06, + "loss": 0.2933, + "num_input_tokens_seen": 15332232, + "step": 23480 + }, + { + "epoch": 13.847287735849056, + "grad_norm": 2.7556118965148926, + "learning_rate": 2.616738853339988e-06, + "loss": 0.3308, + "num_input_tokens_seen": 15335560, + "step": 23485 + }, + { + "epoch": 13.850235849056604, + "grad_norm": 3.5513179302215576, + "learning_rate": 2.614477516671926e-06, + "loss": 0.3435, + "num_input_tokens_seen": 15338536, + "step": 23490 + }, + { + "epoch": 13.85318396226415, + "grad_norm": 8.474603652954102, + "learning_rate": 2.612216811581223e-06, + "loss": 0.3345, + "num_input_tokens_seen": 15342216, + "step": 23495 + }, + { + "epoch": 13.856132075471699, + "grad_norm": 5.50743293762207, + "learning_rate": 2.6099567386664095e-06, + "loss": 0.4646, + "num_input_tokens_seen": 15345544, + "step": 23500 + }, + { + "epoch": 13.859080188679245, + "grad_norm": 3.7315688133239746, + "learning_rate": 2.60769729852585e-06, + "loss": 0.3374, + "num_input_tokens_seen": 15348296, + "step": 23505 + }, + { + "epoch": 13.862028301886792, + "grad_norm": 5.208621501922607, + "learning_rate": 2.6054384917577413e-06, + "loss": 0.2977, + "num_input_tokens_seen": 15351560, + "step": 23510 + }, + { + "epoch": 13.86497641509434, + "grad_norm": 2.850694417953491, + "learning_rate": 2.60318031896011e-06, + "loss": 0.3796, + "num_input_tokens_seen": 15354600, + "step": 23515 + }, + { + "epoch": 13.867924528301886, + "grad_norm": 9.681855201721191, + "learning_rate": 2.60092278073082e-06, + "loss": 0.3804, + "num_input_tokens_seen": 15357352, + "step": 23520 + }, + { + "epoch": 13.870872641509434, + "grad_norm": 7.315431118011475, + "learning_rate": 2.5986658776675644e-06, + "loss": 0.5134, + "num_input_tokens_seen": 15360872, + "step": 23525 + }, + { + "epoch": 13.87382075471698, + "grad_norm": 2.3097472190856934, + "learning_rate": 2.5964096103678666e-06, + "loss": 0.254, + "num_input_tokens_seen": 15363496, + "step": 23530 + }, + { + "epoch": 13.876768867924529, + "grad_norm": 3.9536099433898926, + "learning_rate": 2.5941539794290833e-06, + "loss": 0.3577, + "num_input_tokens_seen": 15366568, + "step": 23535 + }, + { + "epoch": 13.879716981132075, + "grad_norm": 4.6044840812683105, + "learning_rate": 2.5918989854484024e-06, + "loss": 0.3424, + "num_input_tokens_seen": 15370248, + "step": 23540 + }, + { + "epoch": 13.882665094339622, + "grad_norm": 3.61196231842041, + "learning_rate": 2.5896446290228417e-06, + "loss": 0.2838, + "num_input_tokens_seen": 15373352, + "step": 23545 + }, + { + "epoch": 13.88561320754717, + "grad_norm": 2.695436954498291, + "learning_rate": 2.5873909107492547e-06, + "loss": 0.2772, + "num_input_tokens_seen": 15376552, + "step": 23550 + }, + { + "epoch": 13.888561320754716, + "grad_norm": 3.637519359588623, + "learning_rate": 2.5851378312243224e-06, + "loss": 0.4325, + "num_input_tokens_seen": 15380232, + "step": 23555 + }, + { + "epoch": 13.891509433962264, + "grad_norm": 2.0695247650146484, + "learning_rate": 2.5828853910445572e-06, + "loss": 0.3464, + "num_input_tokens_seen": 15383592, + "step": 23560 + }, + { + "epoch": 13.89445754716981, + "grad_norm": 5.051840305328369, + "learning_rate": 2.5806335908063012e-06, + "loss": 0.3864, + "num_input_tokens_seen": 15385768, + "step": 23565 + }, + { + "epoch": 13.897405660377359, + "grad_norm": 3.7241151332855225, + "learning_rate": 2.5783824311057293e-06, + "loss": 0.4094, + "num_input_tokens_seen": 15389032, + "step": 23570 + }, + { + "epoch": 13.900353773584905, + "grad_norm": 3.376349925994873, + "learning_rate": 2.5761319125388433e-06, + "loss": 0.3565, + "num_input_tokens_seen": 15393000, + "step": 23575 + }, + { + "epoch": 13.903301886792454, + "grad_norm": 4.668268203735352, + "learning_rate": 2.57388203570148e-06, + "loss": 0.3611, + "num_input_tokens_seen": 15396648, + "step": 23580 + }, + { + "epoch": 13.90625, + "grad_norm": 4.283264636993408, + "learning_rate": 2.5716328011893055e-06, + "loss": 0.4393, + "num_input_tokens_seen": 15399496, + "step": 23585 + }, + { + "epoch": 13.909198113207546, + "grad_norm": 2.6156091690063477, + "learning_rate": 2.5693842095978127e-06, + "loss": 0.3381, + "num_input_tokens_seen": 15402504, + "step": 23590 + }, + { + "epoch": 13.912146226415095, + "grad_norm": 4.564388751983643, + "learning_rate": 2.567136261522325e-06, + "loss": 0.2975, + "num_input_tokens_seen": 15405896, + "step": 23595 + }, + { + "epoch": 13.915094339622641, + "grad_norm": 3.2943637371063232, + "learning_rate": 2.5648889575579985e-06, + "loss": 0.3256, + "num_input_tokens_seen": 15409640, + "step": 23600 + }, + { + "epoch": 13.91804245283019, + "grad_norm": 2.9994988441467285, + "learning_rate": 2.562642298299814e-06, + "loss": 0.3153, + "num_input_tokens_seen": 15412520, + "step": 23605 + }, + { + "epoch": 13.920990566037736, + "grad_norm": 4.007449150085449, + "learning_rate": 2.560396284342584e-06, + "loss": 0.4431, + "num_input_tokens_seen": 15415560, + "step": 23610 + }, + { + "epoch": 13.923938679245284, + "grad_norm": 3.6312448978424072, + "learning_rate": 2.558150916280954e-06, + "loss": 0.3398, + "num_input_tokens_seen": 15418216, + "step": 23615 + }, + { + "epoch": 13.92688679245283, + "grad_norm": 4.2417449951171875, + "learning_rate": 2.555906194709392e-06, + "loss": 0.3278, + "num_input_tokens_seen": 15421512, + "step": 23620 + }, + { + "epoch": 13.929834905660378, + "grad_norm": 4.329840183258057, + "learning_rate": 2.553662120222199e-06, + "loss": 0.2963, + "num_input_tokens_seen": 15424712, + "step": 23625 + }, + { + "epoch": 13.932783018867925, + "grad_norm": 5.724375247955322, + "learning_rate": 2.5514186934135026e-06, + "loss": 0.3759, + "num_input_tokens_seen": 15428360, + "step": 23630 + }, + { + "epoch": 13.935731132075471, + "grad_norm": 1.7934311628341675, + "learning_rate": 2.54917591487726e-06, + "loss": 0.2503, + "num_input_tokens_seen": 15432392, + "step": 23635 + }, + { + "epoch": 13.93867924528302, + "grad_norm": 7.453405380249023, + "learning_rate": 2.5469337852072547e-06, + "loss": 0.3489, + "num_input_tokens_seen": 15436136, + "step": 23640 + }, + { + "epoch": 13.941627358490566, + "grad_norm": 4.376336574554443, + "learning_rate": 2.5446923049971035e-06, + "loss": 0.4321, + "num_input_tokens_seen": 15438792, + "step": 23645 + }, + { + "epoch": 13.944575471698114, + "grad_norm": 2.8678040504455566, + "learning_rate": 2.5424514748402463e-06, + "loss": 0.2789, + "num_input_tokens_seen": 15441352, + "step": 23650 + }, + { + "epoch": 13.94752358490566, + "grad_norm": 4.922937870025635, + "learning_rate": 2.540211295329953e-06, + "loss": 0.3534, + "num_input_tokens_seen": 15443720, + "step": 23655 + }, + { + "epoch": 13.950471698113208, + "grad_norm": 3.932076930999756, + "learning_rate": 2.5379717670593197e-06, + "loss": 0.3427, + "num_input_tokens_seen": 15447016, + "step": 23660 + }, + { + "epoch": 13.953419811320755, + "grad_norm": 3.239251136779785, + "learning_rate": 2.53573289062127e-06, + "loss": 0.3692, + "num_input_tokens_seen": 15450024, + "step": 23665 + }, + { + "epoch": 13.956367924528301, + "grad_norm": 3.143324851989746, + "learning_rate": 2.5334946666085605e-06, + "loss": 0.3301, + "num_input_tokens_seen": 15453576, + "step": 23670 + }, + { + "epoch": 13.95931603773585, + "grad_norm": 3.1578140258789062, + "learning_rate": 2.531257095613766e-06, + "loss": 0.2432, + "num_input_tokens_seen": 15457480, + "step": 23675 + }, + { + "epoch": 13.962264150943396, + "grad_norm": 3.1515250205993652, + "learning_rate": 2.529020178229297e-06, + "loss": 0.278, + "num_input_tokens_seen": 15461000, + "step": 23680 + }, + { + "epoch": 13.965212264150944, + "grad_norm": 3.1113359928131104, + "learning_rate": 2.5267839150473846e-06, + "loss": 0.3695, + "num_input_tokens_seen": 15464712, + "step": 23685 + }, + { + "epoch": 13.96816037735849, + "grad_norm": 4.312508583068848, + "learning_rate": 2.5245483066600896e-06, + "loss": 0.4236, + "num_input_tokens_seen": 15468648, + "step": 23690 + }, + { + "epoch": 13.971108490566039, + "grad_norm": 5.291144847869873, + "learning_rate": 2.5223133536592996e-06, + "loss": 0.2932, + "num_input_tokens_seen": 15471240, + "step": 23695 + }, + { + "epoch": 13.974056603773585, + "grad_norm": 3.8239948749542236, + "learning_rate": 2.520079056636725e-06, + "loss": 0.308, + "num_input_tokens_seen": 15474824, + "step": 23700 + }, + { + "epoch": 13.977004716981131, + "grad_norm": 4.0720295906066895, + "learning_rate": 2.5178454161839106e-06, + "loss": 0.2688, + "num_input_tokens_seen": 15477608, + "step": 23705 + }, + { + "epoch": 13.97995283018868, + "grad_norm": 2.5558931827545166, + "learning_rate": 2.5156124328922195e-06, + "loss": 0.3269, + "num_input_tokens_seen": 15480872, + "step": 23710 + }, + { + "epoch": 13.982900943396226, + "grad_norm": 7.178508281707764, + "learning_rate": 2.513380107352844e-06, + "loss": 0.3005, + "num_input_tokens_seen": 15483656, + "step": 23715 + }, + { + "epoch": 13.985849056603774, + "grad_norm": 2.667046546936035, + "learning_rate": 2.5111484401568014e-06, + "loss": 0.2763, + "num_input_tokens_seen": 15487016, + "step": 23720 + }, + { + "epoch": 13.98879716981132, + "grad_norm": 2.7007172107696533, + "learning_rate": 2.508917431894936e-06, + "loss": 0.3378, + "num_input_tokens_seen": 15489992, + "step": 23725 + }, + { + "epoch": 13.991745283018869, + "grad_norm": 2.756859540939331, + "learning_rate": 2.5066870831579144e-06, + "loss": 0.2797, + "num_input_tokens_seen": 15492872, + "step": 23730 + }, + { + "epoch": 13.994693396226415, + "grad_norm": 3.462620496749878, + "learning_rate": 2.504457394536235e-06, + "loss": 0.3308, + "num_input_tokens_seen": 15495496, + "step": 23735 + }, + { + "epoch": 13.997641509433961, + "grad_norm": 2.9258315563201904, + "learning_rate": 2.502228366620216e-06, + "loss": 0.2964, + "num_input_tokens_seen": 15498760, + "step": 23740 + }, + { + "epoch": 14.0, + "eval_loss": 0.5794597268104553, + "eval_runtime": 19.2741, + "eval_samples_per_second": 87.994, + "eval_steps_per_second": 21.998, + "num_input_tokens_seen": 15500632, + "step": 23744 + }, + { + "epoch": 14.00058962264151, + "grad_norm": 2.4230945110321045, + "learning_rate": 2.5000000000000015e-06, + "loss": 0.2296, + "num_input_tokens_seen": 15501336, + "step": 23745 + }, + { + "epoch": 14.003537735849056, + "grad_norm": 3.0880932807922363, + "learning_rate": 2.497772295265561e-06, + "loss": 0.4409, + "num_input_tokens_seen": 15504728, + "step": 23750 + }, + { + "epoch": 14.006485849056604, + "grad_norm": 4.411482810974121, + "learning_rate": 2.4955452530066897e-06, + "loss": 0.5093, + "num_input_tokens_seen": 15508280, + "step": 23755 + }, + { + "epoch": 14.00943396226415, + "grad_norm": 3.2562484741210938, + "learning_rate": 2.4933188738130043e-06, + "loss": 0.4395, + "num_input_tokens_seen": 15511224, + "step": 23760 + }, + { + "epoch": 14.012382075471699, + "grad_norm": 3.3255534172058105, + "learning_rate": 2.49109315827395e-06, + "loss": 0.2503, + "num_input_tokens_seen": 15514008, + "step": 23765 + }, + { + "epoch": 14.015330188679245, + "grad_norm": 1.890016794204712, + "learning_rate": 2.4888681069787975e-06, + "loss": 0.2367, + "num_input_tokens_seen": 15517016, + "step": 23770 + }, + { + "epoch": 14.018278301886792, + "grad_norm": 4.630746841430664, + "learning_rate": 2.4866437205166353e-06, + "loss": 0.2587, + "num_input_tokens_seen": 15520632, + "step": 23775 + }, + { + "epoch": 14.02122641509434, + "grad_norm": 3.211172342300415, + "learning_rate": 2.4844199994763803e-06, + "loss": 0.262, + "num_input_tokens_seen": 15523832, + "step": 23780 + }, + { + "epoch": 14.024174528301886, + "grad_norm": 3.8708865642547607, + "learning_rate": 2.482196944446772e-06, + "loss": 0.3154, + "num_input_tokens_seen": 15526776, + "step": 23785 + }, + { + "epoch": 14.027122641509434, + "grad_norm": 4.527602195739746, + "learning_rate": 2.4799745560163736e-06, + "loss": 0.3213, + "num_input_tokens_seen": 15530328, + "step": 23790 + }, + { + "epoch": 14.03007075471698, + "grad_norm": 2.807539463043213, + "learning_rate": 2.4777528347735707e-06, + "loss": 0.3617, + "num_input_tokens_seen": 15533624, + "step": 23795 + }, + { + "epoch": 14.033018867924529, + "grad_norm": 3.2349693775177, + "learning_rate": 2.4755317813065766e-06, + "loss": 0.3075, + "num_input_tokens_seen": 15536696, + "step": 23800 + }, + { + "epoch": 14.035966981132075, + "grad_norm": 4.578696250915527, + "learning_rate": 2.4733113962034234e-06, + "loss": 0.3541, + "num_input_tokens_seen": 15539640, + "step": 23805 + }, + { + "epoch": 14.038915094339623, + "grad_norm": 3.1331701278686523, + "learning_rate": 2.4710916800519674e-06, + "loss": 0.314, + "num_input_tokens_seen": 15542648, + "step": 23810 + }, + { + "epoch": 14.04186320754717, + "grad_norm": 4.700671672821045, + "learning_rate": 2.4688726334398883e-06, + "loss": 0.3291, + "num_input_tokens_seen": 15545912, + "step": 23815 + }, + { + "epoch": 14.044811320754716, + "grad_norm": 4.492247581481934, + "learning_rate": 2.466654256954688e-06, + "loss": 0.3219, + "num_input_tokens_seen": 15549016, + "step": 23820 + }, + { + "epoch": 14.047759433962264, + "grad_norm": 2.913160562515259, + "learning_rate": 2.4644365511836895e-06, + "loss": 0.2469, + "num_input_tokens_seen": 15552216, + "step": 23825 + }, + { + "epoch": 14.05070754716981, + "grad_norm": 3.5613784790039062, + "learning_rate": 2.4622195167140432e-06, + "loss": 0.3636, + "num_input_tokens_seen": 15555640, + "step": 23830 + }, + { + "epoch": 14.053655660377359, + "grad_norm": 4.012432098388672, + "learning_rate": 2.4600031541327173e-06, + "loss": 0.3689, + "num_input_tokens_seen": 15558424, + "step": 23835 + }, + { + "epoch": 14.056603773584905, + "grad_norm": 3.5178496837615967, + "learning_rate": 2.457787464026503e-06, + "loss": 0.2814, + "num_input_tokens_seen": 15561112, + "step": 23840 + }, + { + "epoch": 14.059551886792454, + "grad_norm": 2.2287051677703857, + "learning_rate": 2.455572446982014e-06, + "loss": 0.2921, + "num_input_tokens_seen": 15564376, + "step": 23845 + }, + { + "epoch": 14.0625, + "grad_norm": 3.907405376434326, + "learning_rate": 2.453358103585686e-06, + "loss": 0.3222, + "num_input_tokens_seen": 15567928, + "step": 23850 + }, + { + "epoch": 14.065448113207546, + "grad_norm": 4.563554286956787, + "learning_rate": 2.4511444344237733e-06, + "loss": 0.4102, + "num_input_tokens_seen": 15574104, + "step": 23855 + }, + { + "epoch": 14.068396226415095, + "grad_norm": 3.8091859817504883, + "learning_rate": 2.4489314400823567e-06, + "loss": 0.4085, + "num_input_tokens_seen": 15577592, + "step": 23860 + }, + { + "epoch": 14.071344339622641, + "grad_norm": 2.272507429122925, + "learning_rate": 2.446719121147337e-06, + "loss": 0.3375, + "num_input_tokens_seen": 15580600, + "step": 23865 + }, + { + "epoch": 14.07429245283019, + "grad_norm": 2.571584701538086, + "learning_rate": 2.4445074782044347e-06, + "loss": 0.2539, + "num_input_tokens_seen": 15583224, + "step": 23870 + }, + { + "epoch": 14.077240566037736, + "grad_norm": 3.227062463760376, + "learning_rate": 2.442296511839191e-06, + "loss": 0.3206, + "num_input_tokens_seen": 15586264, + "step": 23875 + }, + { + "epoch": 14.080188679245284, + "grad_norm": 2.988308906555176, + "learning_rate": 2.4400862226369687e-06, + "loss": 0.4368, + "num_input_tokens_seen": 15589528, + "step": 23880 + }, + { + "epoch": 14.08313679245283, + "grad_norm": 4.667728424072266, + "learning_rate": 2.4378766111829514e-06, + "loss": 0.3599, + "num_input_tokens_seen": 15592216, + "step": 23885 + }, + { + "epoch": 14.086084905660377, + "grad_norm": 4.95134162902832, + "learning_rate": 2.435667678062142e-06, + "loss": 0.3327, + "num_input_tokens_seen": 15595672, + "step": 23890 + }, + { + "epoch": 14.089033018867925, + "grad_norm": 3.828526258468628, + "learning_rate": 2.4334594238593682e-06, + "loss": 0.3362, + "num_input_tokens_seen": 15599768, + "step": 23895 + }, + { + "epoch": 14.091981132075471, + "grad_norm": 5.341838359832764, + "learning_rate": 2.4312518491592727e-06, + "loss": 0.3027, + "num_input_tokens_seen": 15602872, + "step": 23900 + }, + { + "epoch": 14.09492924528302, + "grad_norm": 6.476365089416504, + "learning_rate": 2.429044954546322e-06, + "loss": 0.3182, + "num_input_tokens_seen": 15606936, + "step": 23905 + }, + { + "epoch": 14.097877358490566, + "grad_norm": 3.139141321182251, + "learning_rate": 2.426838740604799e-06, + "loss": 0.2793, + "num_input_tokens_seen": 15609752, + "step": 23910 + }, + { + "epoch": 14.100825471698114, + "grad_norm": 4.3305816650390625, + "learning_rate": 2.4246332079188066e-06, + "loss": 0.2909, + "num_input_tokens_seen": 15613720, + "step": 23915 + }, + { + "epoch": 14.10377358490566, + "grad_norm": 4.007850170135498, + "learning_rate": 2.4224283570722745e-06, + "loss": 0.3562, + "num_input_tokens_seen": 15616632, + "step": 23920 + }, + { + "epoch": 14.106721698113208, + "grad_norm": 3.338759422302246, + "learning_rate": 2.420224188648943e-06, + "loss": 0.3721, + "num_input_tokens_seen": 15620312, + "step": 23925 + }, + { + "epoch": 14.109669811320755, + "grad_norm": 3.2355918884277344, + "learning_rate": 2.418020703232376e-06, + "loss": 0.2583, + "num_input_tokens_seen": 15623736, + "step": 23930 + }, + { + "epoch": 14.112617924528301, + "grad_norm": 5.2772111892700195, + "learning_rate": 2.4158179014059556e-06, + "loss": 0.3596, + "num_input_tokens_seen": 15626936, + "step": 23935 + }, + { + "epoch": 14.11556603773585, + "grad_norm": 5.276137351989746, + "learning_rate": 2.413615783752883e-06, + "loss": 0.3877, + "num_input_tokens_seen": 15630040, + "step": 23940 + }, + { + "epoch": 14.118514150943396, + "grad_norm": 6.82316780090332, + "learning_rate": 2.4114143508561767e-06, + "loss": 0.3231, + "num_input_tokens_seen": 15633336, + "step": 23945 + }, + { + "epoch": 14.121462264150944, + "grad_norm": 2.8025989532470703, + "learning_rate": 2.4092136032986783e-06, + "loss": 0.3425, + "num_input_tokens_seen": 15636792, + "step": 23950 + }, + { + "epoch": 14.12441037735849, + "grad_norm": 3.428359270095825, + "learning_rate": 2.407013541663043e-06, + "loss": 0.291, + "num_input_tokens_seen": 15640696, + "step": 23955 + }, + { + "epoch": 14.127358490566039, + "grad_norm": 3.9980931282043457, + "learning_rate": 2.4048141665317493e-06, + "loss": 0.3281, + "num_input_tokens_seen": 15644312, + "step": 23960 + }, + { + "epoch": 14.130306603773585, + "grad_norm": 2.9106061458587646, + "learning_rate": 2.40261547848709e-06, + "loss": 0.3335, + "num_input_tokens_seen": 15646840, + "step": 23965 + }, + { + "epoch": 14.133254716981131, + "grad_norm": 2.877027988433838, + "learning_rate": 2.400417478111176e-06, + "loss": 0.2666, + "num_input_tokens_seen": 15649464, + "step": 23970 + }, + { + "epoch": 14.13620283018868, + "grad_norm": 4.340320587158203, + "learning_rate": 2.3982201659859387e-06, + "loss": 0.2872, + "num_input_tokens_seen": 15653432, + "step": 23975 + }, + { + "epoch": 14.139150943396226, + "grad_norm": 4.135821342468262, + "learning_rate": 2.3960235426931237e-06, + "loss": 0.4049, + "num_input_tokens_seen": 15657112, + "step": 23980 + }, + { + "epoch": 14.142099056603774, + "grad_norm": 3.5019288063049316, + "learning_rate": 2.3938276088143003e-06, + "loss": 0.3375, + "num_input_tokens_seen": 15660632, + "step": 23985 + }, + { + "epoch": 14.14504716981132, + "grad_norm": 4.287090301513672, + "learning_rate": 2.391632364930849e-06, + "loss": 0.4826, + "num_input_tokens_seen": 15663064, + "step": 23990 + }, + { + "epoch": 14.147995283018869, + "grad_norm": 2.103794813156128, + "learning_rate": 2.3894378116239706e-06, + "loss": 0.357, + "num_input_tokens_seen": 15666168, + "step": 23995 + }, + { + "epoch": 14.150943396226415, + "grad_norm": 4.531780242919922, + "learning_rate": 2.387243949474683e-06, + "loss": 0.5006, + "num_input_tokens_seen": 15669528, + "step": 24000 + }, + { + "epoch": 14.153891509433961, + "grad_norm": 3.19777512550354, + "learning_rate": 2.38505077906382e-06, + "loss": 0.2869, + "num_input_tokens_seen": 15673016, + "step": 24005 + }, + { + "epoch": 14.15683962264151, + "grad_norm": 2.9102702140808105, + "learning_rate": 2.382858300972031e-06, + "loss": 0.2432, + "num_input_tokens_seen": 15676696, + "step": 24010 + }, + { + "epoch": 14.159787735849056, + "grad_norm": 2.849848747253418, + "learning_rate": 2.380666515779788e-06, + "loss": 0.4, + "num_input_tokens_seen": 15680600, + "step": 24015 + }, + { + "epoch": 14.162735849056604, + "grad_norm": 2.669362783432007, + "learning_rate": 2.3784754240673734e-06, + "loss": 0.2617, + "num_input_tokens_seen": 15683800, + "step": 24020 + }, + { + "epoch": 14.16568396226415, + "grad_norm": 3.906259059906006, + "learning_rate": 2.3762850264148883e-06, + "loss": 0.3724, + "num_input_tokens_seen": 15686616, + "step": 24025 + }, + { + "epoch": 14.168632075471699, + "grad_norm": 2.4967591762542725, + "learning_rate": 2.374095323402251e-06, + "loss": 0.2792, + "num_input_tokens_seen": 15690872, + "step": 24030 + }, + { + "epoch": 14.171580188679245, + "grad_norm": 5.160085678100586, + "learning_rate": 2.371906315609193e-06, + "loss": 0.3147, + "num_input_tokens_seen": 15694104, + "step": 24035 + }, + { + "epoch": 14.174528301886792, + "grad_norm": 3.1551268100738525, + "learning_rate": 2.369718003615263e-06, + "loss": 0.4198, + "num_input_tokens_seen": 15697496, + "step": 24040 + }, + { + "epoch": 14.17747641509434, + "grad_norm": 4.523599624633789, + "learning_rate": 2.3675303879998284e-06, + "loss": 0.3282, + "num_input_tokens_seen": 15701016, + "step": 24045 + }, + { + "epoch": 14.180424528301886, + "grad_norm": 6.506444454193115, + "learning_rate": 2.365343469342068e-06, + "loss": 0.4184, + "num_input_tokens_seen": 15703512, + "step": 24050 + }, + { + "epoch": 14.183372641509434, + "grad_norm": 3.0132546424865723, + "learning_rate": 2.3631572482209803e-06, + "loss": 0.2672, + "num_input_tokens_seen": 15706936, + "step": 24055 + }, + { + "epoch": 14.18632075471698, + "grad_norm": 3.9569203853607178, + "learning_rate": 2.3609717252153752e-06, + "loss": 0.3647, + "num_input_tokens_seen": 15709560, + "step": 24060 + }, + { + "epoch": 14.189268867924529, + "grad_norm": 2.8083577156066895, + "learning_rate": 2.35878690090388e-06, + "loss": 0.3087, + "num_input_tokens_seen": 15712632, + "step": 24065 + }, + { + "epoch": 14.192216981132075, + "grad_norm": 2.3957791328430176, + "learning_rate": 2.356602775864935e-06, + "loss": 0.3228, + "num_input_tokens_seen": 15715800, + "step": 24070 + }, + { + "epoch": 14.195165094339623, + "grad_norm": 2.199737310409546, + "learning_rate": 2.354419350676796e-06, + "loss": 0.348, + "num_input_tokens_seen": 15718936, + "step": 24075 + }, + { + "epoch": 14.19811320754717, + "grad_norm": 2.7082836627960205, + "learning_rate": 2.3522366259175377e-06, + "loss": 0.2849, + "num_input_tokens_seen": 15722072, + "step": 24080 + }, + { + "epoch": 14.201061320754716, + "grad_norm": 2.5783817768096924, + "learning_rate": 2.350054602165044e-06, + "loss": 0.3787, + "num_input_tokens_seen": 15726008, + "step": 24085 + }, + { + "epoch": 14.204009433962264, + "grad_norm": 3.0265262126922607, + "learning_rate": 2.3478732799970143e-06, + "loss": 0.2859, + "num_input_tokens_seen": 15728504, + "step": 24090 + }, + { + "epoch": 14.20695754716981, + "grad_norm": 5.609025955200195, + "learning_rate": 2.3456926599909646e-06, + "loss": 0.381, + "num_input_tokens_seen": 15731768, + "step": 24095 + }, + { + "epoch": 14.209905660377359, + "grad_norm": 4.55352783203125, + "learning_rate": 2.343512742724222e-06, + "loss": 0.3961, + "num_input_tokens_seen": 15735416, + "step": 24100 + }, + { + "epoch": 14.212853773584905, + "grad_norm": 4.563655376434326, + "learning_rate": 2.341333528773928e-06, + "loss": 0.3129, + "num_input_tokens_seen": 15739896, + "step": 24105 + }, + { + "epoch": 14.215801886792454, + "grad_norm": 4.843811988830566, + "learning_rate": 2.3391550187170427e-06, + "loss": 0.3178, + "num_input_tokens_seen": 15742712, + "step": 24110 + }, + { + "epoch": 14.21875, + "grad_norm": 6.251970291137695, + "learning_rate": 2.336977213130333e-06, + "loss": 0.3548, + "num_input_tokens_seen": 15745528, + "step": 24115 + }, + { + "epoch": 14.221698113207546, + "grad_norm": 5.0519304275512695, + "learning_rate": 2.3348001125903837e-06, + "loss": 0.4504, + "num_input_tokens_seen": 15748536, + "step": 24120 + }, + { + "epoch": 14.224646226415095, + "grad_norm": 2.9172656536102295, + "learning_rate": 2.3326237176735905e-06, + "loss": 0.2631, + "num_input_tokens_seen": 15751000, + "step": 24125 + }, + { + "epoch": 14.227594339622641, + "grad_norm": 3.3330605030059814, + "learning_rate": 2.330448028956164e-06, + "loss": 0.2291, + "num_input_tokens_seen": 15754904, + "step": 24130 + }, + { + "epoch": 14.23054245283019, + "grad_norm": 3.406487464904785, + "learning_rate": 2.3282730470141255e-06, + "loss": 0.3708, + "num_input_tokens_seen": 15757912, + "step": 24135 + }, + { + "epoch": 14.233490566037736, + "grad_norm": 5.141595363616943, + "learning_rate": 2.3260987724233143e-06, + "loss": 0.4114, + "num_input_tokens_seen": 15760792, + "step": 24140 + }, + { + "epoch": 14.236438679245284, + "grad_norm": 2.2629570960998535, + "learning_rate": 2.323925205759374e-06, + "loss": 0.3103, + "num_input_tokens_seen": 15763704, + "step": 24145 + }, + { + "epoch": 14.23938679245283, + "grad_norm": 1.752752423286438, + "learning_rate": 2.3217523475977715e-06, + "loss": 0.3204, + "num_input_tokens_seen": 15767128, + "step": 24150 + }, + { + "epoch": 14.242334905660377, + "grad_norm": 4.218230247497559, + "learning_rate": 2.3195801985137773e-06, + "loss": 0.4223, + "num_input_tokens_seen": 15770904, + "step": 24155 + }, + { + "epoch": 14.245283018867925, + "grad_norm": 3.371896505355835, + "learning_rate": 2.317408759082478e-06, + "loss": 0.2677, + "num_input_tokens_seen": 15773528, + "step": 24160 + }, + { + "epoch": 14.248231132075471, + "grad_norm": 3.2545254230499268, + "learning_rate": 2.31523802987877e-06, + "loss": 0.2553, + "num_input_tokens_seen": 15776312, + "step": 24165 + }, + { + "epoch": 14.25117924528302, + "grad_norm": 3.053255319595337, + "learning_rate": 2.3130680114773637e-06, + "loss": 0.3026, + "num_input_tokens_seen": 15778680, + "step": 24170 + }, + { + "epoch": 14.254127358490566, + "grad_norm": 4.017597675323486, + "learning_rate": 2.310898704452782e-06, + "loss": 0.3387, + "num_input_tokens_seen": 15782072, + "step": 24175 + }, + { + "epoch": 14.257075471698114, + "grad_norm": 3.984003782272339, + "learning_rate": 2.3087301093793584e-06, + "loss": 0.2779, + "num_input_tokens_seen": 15784856, + "step": 24180 + }, + { + "epoch": 14.26002358490566, + "grad_norm": 3.8263967037200928, + "learning_rate": 2.306562226831237e-06, + "loss": 0.3186, + "num_input_tokens_seen": 15788024, + "step": 24185 + }, + { + "epoch": 14.262971698113208, + "grad_norm": 3.95196795463562, + "learning_rate": 2.304395057382374e-06, + "loss": 0.2294, + "num_input_tokens_seen": 15790648, + "step": 24190 + }, + { + "epoch": 14.265919811320755, + "grad_norm": 2.1403892040252686, + "learning_rate": 2.3022286016065354e-06, + "loss": 0.3453, + "num_input_tokens_seen": 15793656, + "step": 24195 + }, + { + "epoch": 14.268867924528301, + "grad_norm": 2.7065579891204834, + "learning_rate": 2.300062860077303e-06, + "loss": 0.2681, + "num_input_tokens_seen": 15796248, + "step": 24200 + }, + { + "epoch": 14.27181603773585, + "grad_norm": 3.412202835083008, + "learning_rate": 2.297897833368064e-06, + "loss": 0.3628, + "num_input_tokens_seen": 15799192, + "step": 24205 + }, + { + "epoch": 14.274764150943396, + "grad_norm": 3.5632128715515137, + "learning_rate": 2.2957335220520194e-06, + "loss": 0.3291, + "num_input_tokens_seen": 15801496, + "step": 24210 + }, + { + "epoch": 14.277712264150944, + "grad_norm": 5.4536848068237305, + "learning_rate": 2.293569926702179e-06, + "loss": 0.2567, + "num_input_tokens_seen": 15806520, + "step": 24215 + }, + { + "epoch": 14.28066037735849, + "grad_norm": 2.7862112522125244, + "learning_rate": 2.291407047891366e-06, + "loss": 0.3222, + "num_input_tokens_seen": 15810584, + "step": 24220 + }, + { + "epoch": 14.283608490566039, + "grad_norm": 2.908254384994507, + "learning_rate": 2.2892448861922075e-06, + "loss": 0.3436, + "num_input_tokens_seen": 15814072, + "step": 24225 + }, + { + "epoch": 14.286556603773585, + "grad_norm": 5.162661552429199, + "learning_rate": 2.2870834421771505e-06, + "loss": 0.3707, + "num_input_tokens_seen": 15816984, + "step": 24230 + }, + { + "epoch": 14.289504716981131, + "grad_norm": 2.474459171295166, + "learning_rate": 2.2849227164184433e-06, + "loss": 0.313, + "num_input_tokens_seen": 15820280, + "step": 24235 + }, + { + "epoch": 14.29245283018868, + "grad_norm": 3.512907028198242, + "learning_rate": 2.2827627094881473e-06, + "loss": 0.4984, + "num_input_tokens_seen": 15826072, + "step": 24240 + }, + { + "epoch": 14.295400943396226, + "grad_norm": 3.002999782562256, + "learning_rate": 2.2806034219581364e-06, + "loss": 0.2882, + "num_input_tokens_seen": 15831416, + "step": 24245 + }, + { + "epoch": 14.298349056603774, + "grad_norm": 3.2839317321777344, + "learning_rate": 2.278444854400089e-06, + "loss": 0.4109, + "num_input_tokens_seen": 15834200, + "step": 24250 + }, + { + "epoch": 14.30129716981132, + "grad_norm": 3.258768081665039, + "learning_rate": 2.276287007385496e-06, + "loss": 0.3489, + "num_input_tokens_seen": 15837016, + "step": 24255 + }, + { + "epoch": 14.304245283018869, + "grad_norm": 3.3484063148498535, + "learning_rate": 2.2741298814856542e-06, + "loss": 0.2863, + "num_input_tokens_seen": 15840760, + "step": 24260 + }, + { + "epoch": 14.307193396226415, + "grad_norm": 3.0318691730499268, + "learning_rate": 2.2719734772716763e-06, + "loss": 0.2631, + "num_input_tokens_seen": 15844184, + "step": 24265 + }, + { + "epoch": 14.310141509433961, + "grad_norm": 2.8463633060455322, + "learning_rate": 2.269817795314477e-06, + "loss": 0.3773, + "num_input_tokens_seen": 15848760, + "step": 24270 + }, + { + "epoch": 14.31308962264151, + "grad_norm": 3.2096729278564453, + "learning_rate": 2.2676628361847834e-06, + "loss": 0.2671, + "num_input_tokens_seen": 15851960, + "step": 24275 + }, + { + "epoch": 14.316037735849056, + "grad_norm": 4.803024768829346, + "learning_rate": 2.2655086004531296e-06, + "loss": 0.3662, + "num_input_tokens_seen": 15855960, + "step": 24280 + }, + { + "epoch": 14.318985849056604, + "grad_norm": 3.032406806945801, + "learning_rate": 2.2633550886898583e-06, + "loss": 0.3505, + "num_input_tokens_seen": 15858744, + "step": 24285 + }, + { + "epoch": 14.32193396226415, + "grad_norm": 2.8560009002685547, + "learning_rate": 2.26120230146512e-06, + "loss": 0.3486, + "num_input_tokens_seen": 15861752, + "step": 24290 + }, + { + "epoch": 14.324882075471699, + "grad_norm": 3.820908784866333, + "learning_rate": 2.2590502393488777e-06, + "loss": 0.3813, + "num_input_tokens_seen": 15865784, + "step": 24295 + }, + { + "epoch": 14.327830188679245, + "grad_norm": 4.50192403793335, + "learning_rate": 2.256898902910898e-06, + "loss": 0.3559, + "num_input_tokens_seen": 15868280, + "step": 24300 + }, + { + "epoch": 14.330778301886792, + "grad_norm": 4.109908580780029, + "learning_rate": 2.2547482927207548e-06, + "loss": 0.3393, + "num_input_tokens_seen": 15871672, + "step": 24305 + }, + { + "epoch": 14.33372641509434, + "grad_norm": 2.7612059116363525, + "learning_rate": 2.252598409347833e-06, + "loss": 0.3198, + "num_input_tokens_seen": 15874840, + "step": 24310 + }, + { + "epoch": 14.336674528301886, + "grad_norm": 2.621295213699341, + "learning_rate": 2.250449253361323e-06, + "loss": 0.3886, + "num_input_tokens_seen": 15878264, + "step": 24315 + }, + { + "epoch": 14.339622641509434, + "grad_norm": 3.8720176219940186, + "learning_rate": 2.2483008253302214e-06, + "loss": 0.3586, + "num_input_tokens_seen": 15881528, + "step": 24320 + }, + { + "epoch": 14.34257075471698, + "grad_norm": 4.059556484222412, + "learning_rate": 2.246153125823337e-06, + "loss": 0.3412, + "num_input_tokens_seen": 15885304, + "step": 24325 + }, + { + "epoch": 14.345518867924529, + "grad_norm": 3.831134796142578, + "learning_rate": 2.2440061554092813e-06, + "loss": 0.4714, + "num_input_tokens_seen": 15888408, + "step": 24330 + }, + { + "epoch": 14.348466981132075, + "grad_norm": 3.310366153717041, + "learning_rate": 2.2418599146564714e-06, + "loss": 0.2547, + "num_input_tokens_seen": 15892280, + "step": 24335 + }, + { + "epoch": 14.351415094339623, + "grad_norm": 5.969259262084961, + "learning_rate": 2.239714404133138e-06, + "loss": 0.2886, + "num_input_tokens_seen": 15895064, + "step": 24340 + }, + { + "epoch": 14.35436320754717, + "grad_norm": 4.251812934875488, + "learning_rate": 2.2375696244073126e-06, + "loss": 0.3512, + "num_input_tokens_seen": 15898232, + "step": 24345 + }, + { + "epoch": 14.357311320754716, + "grad_norm": 3.0990238189697266, + "learning_rate": 2.235425576046834e-06, + "loss": 0.3115, + "num_input_tokens_seen": 15900472, + "step": 24350 + }, + { + "epoch": 14.360259433962264, + "grad_norm": 3.015045642852783, + "learning_rate": 2.233282259619347e-06, + "loss": 0.33, + "num_input_tokens_seen": 15902808, + "step": 24355 + }, + { + "epoch": 14.36320754716981, + "grad_norm": 2.810035228729248, + "learning_rate": 2.231139675692308e-06, + "loss": 0.3507, + "num_input_tokens_seen": 15906136, + "step": 24360 + }, + { + "epoch": 14.366155660377359, + "grad_norm": 3.0610291957855225, + "learning_rate": 2.228997824832973e-06, + "loss": 0.372, + "num_input_tokens_seen": 15912728, + "step": 24365 + }, + { + "epoch": 14.369103773584905, + "grad_norm": 2.4471137523651123, + "learning_rate": 2.226856707608406e-06, + "loss": 0.2313, + "num_input_tokens_seen": 15917016, + "step": 24370 + }, + { + "epoch": 14.372051886792454, + "grad_norm": 4.8992486000061035, + "learning_rate": 2.2247163245854768e-06, + "loss": 0.3072, + "num_input_tokens_seen": 15919128, + "step": 24375 + }, + { + "epoch": 14.375, + "grad_norm": 3.533554792404175, + "learning_rate": 2.222576676330862e-06, + "loss": 0.3135, + "num_input_tokens_seen": 15921752, + "step": 24380 + }, + { + "epoch": 14.377948113207546, + "grad_norm": 3.7241976261138916, + "learning_rate": 2.2204377634110403e-06, + "loss": 0.3074, + "num_input_tokens_seen": 15925784, + "step": 24385 + }, + { + "epoch": 14.380896226415095, + "grad_norm": 3.2024948596954346, + "learning_rate": 2.218299586392301e-06, + "loss": 0.3555, + "num_input_tokens_seen": 15928632, + "step": 24390 + }, + { + "epoch": 14.383844339622641, + "grad_norm": 1.9672027826309204, + "learning_rate": 2.2161621458407355e-06, + "loss": 0.3114, + "num_input_tokens_seen": 15932216, + "step": 24395 + }, + { + "epoch": 14.38679245283019, + "grad_norm": 2.917719602584839, + "learning_rate": 2.2140254423222398e-06, + "loss": 0.3, + "num_input_tokens_seen": 15935800, + "step": 24400 + }, + { + "epoch": 14.389740566037736, + "grad_norm": 3.368609666824341, + "learning_rate": 2.2118894764025146e-06, + "loss": 0.2226, + "num_input_tokens_seen": 15938520, + "step": 24405 + }, + { + "epoch": 14.392688679245284, + "grad_norm": 2.9019968509674072, + "learning_rate": 2.2097542486470667e-06, + "loss": 0.3389, + "num_input_tokens_seen": 15941752, + "step": 24410 + }, + { + "epoch": 14.39563679245283, + "grad_norm": 3.6766483783721924, + "learning_rate": 2.207619759621205e-06, + "loss": 0.3444, + "num_input_tokens_seen": 15945272, + "step": 24415 + }, + { + "epoch": 14.398584905660377, + "grad_norm": 3.259132146835327, + "learning_rate": 2.205486009890049e-06, + "loss": 0.3377, + "num_input_tokens_seen": 15948952, + "step": 24420 + }, + { + "epoch": 14.401533018867925, + "grad_norm": 4.039430618286133, + "learning_rate": 2.2033530000185146e-06, + "loss": 0.2825, + "num_input_tokens_seen": 15954680, + "step": 24425 + }, + { + "epoch": 14.404481132075471, + "grad_norm": 3.5588290691375732, + "learning_rate": 2.2012207305713244e-06, + "loss": 0.3256, + "num_input_tokens_seen": 15956984, + "step": 24430 + }, + { + "epoch": 14.40742924528302, + "grad_norm": 4.120483875274658, + "learning_rate": 2.19908920211301e-06, + "loss": 0.3601, + "num_input_tokens_seen": 15959960, + "step": 24435 + }, + { + "epoch": 14.410377358490566, + "grad_norm": 2.310478448867798, + "learning_rate": 2.196958415207901e-06, + "loss": 0.2407, + "num_input_tokens_seen": 15963416, + "step": 24440 + }, + { + "epoch": 14.413325471698114, + "grad_norm": 4.675232410430908, + "learning_rate": 2.1948283704201312e-06, + "loss": 0.3438, + "num_input_tokens_seen": 15966488, + "step": 24445 + }, + { + "epoch": 14.41627358490566, + "grad_norm": 2.6562023162841797, + "learning_rate": 2.1926990683136383e-06, + "loss": 0.3504, + "num_input_tokens_seen": 15969784, + "step": 24450 + }, + { + "epoch": 14.419221698113208, + "grad_norm": 3.6201164722442627, + "learning_rate": 2.1905705094521685e-06, + "loss": 0.3158, + "num_input_tokens_seen": 15972696, + "step": 24455 + }, + { + "epoch": 14.422169811320755, + "grad_norm": 2.400818109512329, + "learning_rate": 2.1884426943992635e-06, + "loss": 0.3193, + "num_input_tokens_seen": 15975832, + "step": 24460 + }, + { + "epoch": 14.425117924528301, + "grad_norm": 3.1438746452331543, + "learning_rate": 2.1863156237182727e-06, + "loss": 0.3011, + "num_input_tokens_seen": 15978936, + "step": 24465 + }, + { + "epoch": 14.42806603773585, + "grad_norm": 3.6834146976470947, + "learning_rate": 2.1841892979723466e-06, + "loss": 0.2822, + "num_input_tokens_seen": 15981752, + "step": 24470 + }, + { + "epoch": 14.431014150943396, + "grad_norm": 5.897389888763428, + "learning_rate": 2.1820637177244375e-06, + "loss": 0.3454, + "num_input_tokens_seen": 15984536, + "step": 24475 + }, + { + "epoch": 14.433962264150944, + "grad_norm": 2.9747421741485596, + "learning_rate": 2.179938883537306e-06, + "loss": 0.2998, + "num_input_tokens_seen": 15987288, + "step": 24480 + }, + { + "epoch": 14.43691037735849, + "grad_norm": 4.500246047973633, + "learning_rate": 2.177814795973508e-06, + "loss": 0.3096, + "num_input_tokens_seen": 15990840, + "step": 24485 + }, + { + "epoch": 14.439858490566039, + "grad_norm": 4.845057487487793, + "learning_rate": 2.1756914555954064e-06, + "loss": 0.2668, + "num_input_tokens_seen": 15994424, + "step": 24490 + }, + { + "epoch": 14.442806603773585, + "grad_norm": 1.8264859914779663, + "learning_rate": 2.173568862965164e-06, + "loss": 0.2608, + "num_input_tokens_seen": 15997880, + "step": 24495 + }, + { + "epoch": 14.445754716981131, + "grad_norm": 7.522373199462891, + "learning_rate": 2.171447018644746e-06, + "loss": 0.5477, + "num_input_tokens_seen": 16000472, + "step": 24500 + }, + { + "epoch": 14.44870283018868, + "grad_norm": 4.088736057281494, + "learning_rate": 2.1693259231959186e-06, + "loss": 0.3466, + "num_input_tokens_seen": 16003512, + "step": 24505 + }, + { + "epoch": 14.451650943396226, + "grad_norm": 6.272796630859375, + "learning_rate": 2.1672055771802545e-06, + "loss": 0.3931, + "num_input_tokens_seen": 16006520, + "step": 24510 + }, + { + "epoch": 14.454599056603774, + "grad_norm": 3.148322343826294, + "learning_rate": 2.1650859811591224e-06, + "loss": 0.265, + "num_input_tokens_seen": 16009592, + "step": 24515 + }, + { + "epoch": 14.45754716981132, + "grad_norm": 3.959523916244507, + "learning_rate": 2.1629671356936943e-06, + "loss": 0.3259, + "num_input_tokens_seen": 16013656, + "step": 24520 + }, + { + "epoch": 14.460495283018869, + "grad_norm": 3.443258285522461, + "learning_rate": 2.1608490413449428e-06, + "loss": 0.2691, + "num_input_tokens_seen": 16016856, + "step": 24525 + }, + { + "epoch": 14.463443396226415, + "grad_norm": 4.163712501525879, + "learning_rate": 2.158731698673645e-06, + "loss": 0.3788, + "num_input_tokens_seen": 16019256, + "step": 24530 + }, + { + "epoch": 14.466391509433961, + "grad_norm": 3.3735201358795166, + "learning_rate": 2.1566151082403752e-06, + "loss": 0.2687, + "num_input_tokens_seen": 16022104, + "step": 24535 + }, + { + "epoch": 14.46933962264151, + "grad_norm": 4.417233943939209, + "learning_rate": 2.154499270605508e-06, + "loss": 0.2921, + "num_input_tokens_seen": 16025112, + "step": 24540 + }, + { + "epoch": 14.472287735849056, + "grad_norm": 6.360416412353516, + "learning_rate": 2.1523841863292243e-06, + "loss": 0.3586, + "num_input_tokens_seen": 16028088, + "step": 24545 + }, + { + "epoch": 14.475235849056604, + "grad_norm": 2.3085317611694336, + "learning_rate": 2.1502698559714998e-06, + "loss": 0.3377, + "num_input_tokens_seen": 16031640, + "step": 24550 + }, + { + "epoch": 14.47818396226415, + "grad_norm": 1.9191153049468994, + "learning_rate": 2.1481562800921125e-06, + "loss": 0.3389, + "num_input_tokens_seen": 16035064, + "step": 24555 + }, + { + "epoch": 14.481132075471699, + "grad_norm": 5.830350875854492, + "learning_rate": 2.146043459250641e-06, + "loss": 0.3088, + "num_input_tokens_seen": 16037848, + "step": 24560 + }, + { + "epoch": 14.484080188679245, + "grad_norm": 4.165152549743652, + "learning_rate": 2.1439313940064634e-06, + "loss": 0.3072, + "num_input_tokens_seen": 16041432, + "step": 24565 + }, + { + "epoch": 14.487028301886792, + "grad_norm": 7.840921401977539, + "learning_rate": 2.141820084918756e-06, + "loss": 0.3589, + "num_input_tokens_seen": 16045656, + "step": 24570 + }, + { + "epoch": 14.48997641509434, + "grad_norm": 3.7904062271118164, + "learning_rate": 2.1397095325465013e-06, + "loss": 0.3401, + "num_input_tokens_seen": 16048760, + "step": 24575 + }, + { + "epoch": 14.492924528301886, + "grad_norm": 3.34272837638855, + "learning_rate": 2.1375997374484754e-06, + "loss": 0.3185, + "num_input_tokens_seen": 16052792, + "step": 24580 + }, + { + "epoch": 14.495872641509434, + "grad_norm": 4.443939208984375, + "learning_rate": 2.1354907001832546e-06, + "loss": 0.3174, + "num_input_tokens_seen": 16055544, + "step": 24585 + }, + { + "epoch": 14.49882075471698, + "grad_norm": 3.0289082527160645, + "learning_rate": 2.133382421309217e-06, + "loss": 0.2882, + "num_input_tokens_seen": 16058168, + "step": 24590 + }, + { + "epoch": 14.501768867924529, + "grad_norm": 3.923945188522339, + "learning_rate": 2.131274901384537e-06, + "loss": 0.3229, + "num_input_tokens_seen": 16062072, + "step": 24595 + }, + { + "epoch": 14.504716981132075, + "grad_norm": 3.5849239826202393, + "learning_rate": 2.1291681409671896e-06, + "loss": 0.3107, + "num_input_tokens_seen": 16064824, + "step": 24600 + }, + { + "epoch": 14.507665094339622, + "grad_norm": 17.40311050415039, + "learning_rate": 2.12706214061495e-06, + "loss": 0.302, + "num_input_tokens_seen": 16067320, + "step": 24605 + }, + { + "epoch": 14.51061320754717, + "grad_norm": 4.50157356262207, + "learning_rate": 2.124956900885391e-06, + "loss": 0.4105, + "num_input_tokens_seen": 16070008, + "step": 24610 + }, + { + "epoch": 14.513561320754716, + "grad_norm": 2.9883038997650146, + "learning_rate": 2.1228524223358833e-06, + "loss": 0.2902, + "num_input_tokens_seen": 16072504, + "step": 24615 + }, + { + "epoch": 14.516509433962264, + "grad_norm": 3.6989357471466064, + "learning_rate": 2.120748705523595e-06, + "loss": 0.3866, + "num_input_tokens_seen": 16075896, + "step": 24620 + }, + { + "epoch": 14.51945754716981, + "grad_norm": 4.705385684967041, + "learning_rate": 2.1186457510054976e-06, + "loss": 0.2418, + "num_input_tokens_seen": 16078296, + "step": 24625 + }, + { + "epoch": 14.522405660377359, + "grad_norm": 3.6937990188598633, + "learning_rate": 2.116543559338355e-06, + "loss": 0.3203, + "num_input_tokens_seen": 16080792, + "step": 24630 + }, + { + "epoch": 14.525353773584905, + "grad_norm": 3.274273633956909, + "learning_rate": 2.1144421310787305e-06, + "loss": 0.3166, + "num_input_tokens_seen": 16084152, + "step": 24635 + }, + { + "epoch": 14.528301886792454, + "grad_norm": 6.232609748840332, + "learning_rate": 2.11234146678299e-06, + "loss": 0.3757, + "num_input_tokens_seen": 16087192, + "step": 24640 + }, + { + "epoch": 14.53125, + "grad_norm": 4.3313889503479, + "learning_rate": 2.1102415670072907e-06, + "loss": 0.4327, + "num_input_tokens_seen": 16090392, + "step": 24645 + }, + { + "epoch": 14.534198113207546, + "grad_norm": 3.7667744159698486, + "learning_rate": 2.108142432307591e-06, + "loss": 0.4116, + "num_input_tokens_seen": 16094200, + "step": 24650 + }, + { + "epoch": 14.537146226415095, + "grad_norm": 2.474250078201294, + "learning_rate": 2.1060440632396456e-06, + "loss": 0.259, + "num_input_tokens_seen": 16098008, + "step": 24655 + }, + { + "epoch": 14.540094339622641, + "grad_norm": 3.3793652057647705, + "learning_rate": 2.103946460359007e-06, + "loss": 0.2354, + "num_input_tokens_seen": 16100664, + "step": 24660 + }, + { + "epoch": 14.54304245283019, + "grad_norm": 5.267772674560547, + "learning_rate": 2.101849624221022e-06, + "loss": 0.3667, + "num_input_tokens_seen": 16103384, + "step": 24665 + }, + { + "epoch": 14.545990566037736, + "grad_norm": 2.518512010574341, + "learning_rate": 2.0997535553808417e-06, + "loss": 0.2406, + "num_input_tokens_seen": 16106488, + "step": 24670 + }, + { + "epoch": 14.548938679245284, + "grad_norm": 2.0440850257873535, + "learning_rate": 2.0976582543934064e-06, + "loss": 0.2744, + "num_input_tokens_seen": 16110360, + "step": 24675 + }, + { + "epoch": 14.55188679245283, + "grad_norm": 3.3194093704223633, + "learning_rate": 2.0955637218134573e-06, + "loss": 0.2792, + "num_input_tokens_seen": 16113880, + "step": 24680 + }, + { + "epoch": 14.554834905660378, + "grad_norm": 2.589024543762207, + "learning_rate": 2.09346995819553e-06, + "loss": 0.2526, + "num_input_tokens_seen": 16117080, + "step": 24685 + }, + { + "epoch": 14.557783018867925, + "grad_norm": 5.041914939880371, + "learning_rate": 2.0913769640939553e-06, + "loss": 0.4293, + "num_input_tokens_seen": 16120440, + "step": 24690 + }, + { + "epoch": 14.560731132075471, + "grad_norm": 4.148814678192139, + "learning_rate": 2.0892847400628674e-06, + "loss": 0.4181, + "num_input_tokens_seen": 16123864, + "step": 24695 + }, + { + "epoch": 14.56367924528302, + "grad_norm": 6.190976619720459, + "learning_rate": 2.0871932866561885e-06, + "loss": 0.311, + "num_input_tokens_seen": 16127032, + "step": 24700 + }, + { + "epoch": 14.566627358490566, + "grad_norm": 2.834306478500366, + "learning_rate": 2.0851026044276405e-06, + "loss": 0.301, + "num_input_tokens_seen": 16129528, + "step": 24705 + }, + { + "epoch": 14.569575471698114, + "grad_norm": 6.893313884735107, + "learning_rate": 2.083012693930741e-06, + "loss": 0.3594, + "num_input_tokens_seen": 16131864, + "step": 24710 + }, + { + "epoch": 14.57252358490566, + "grad_norm": 2.034270763397217, + "learning_rate": 2.0809235557188e-06, + "loss": 0.2058, + "num_input_tokens_seen": 16135992, + "step": 24715 + }, + { + "epoch": 14.575471698113208, + "grad_norm": 2.03066086769104, + "learning_rate": 2.0788351903449307e-06, + "loss": 0.3389, + "num_input_tokens_seen": 16139032, + "step": 24720 + }, + { + "epoch": 14.578419811320755, + "grad_norm": 2.4219069480895996, + "learning_rate": 2.0767475983620317e-06, + "loss": 0.2686, + "num_input_tokens_seen": 16142712, + "step": 24725 + }, + { + "epoch": 14.581367924528301, + "grad_norm": 3.401991605758667, + "learning_rate": 2.074660780322806e-06, + "loss": 0.3082, + "num_input_tokens_seen": 16146616, + "step": 24730 + }, + { + "epoch": 14.58431603773585, + "grad_norm": 2.8044910430908203, + "learning_rate": 2.0725747367797473e-06, + "loss": 0.3198, + "num_input_tokens_seen": 16149560, + "step": 24735 + }, + { + "epoch": 14.587264150943396, + "grad_norm": 2.5029103755950928, + "learning_rate": 2.070489468285143e-06, + "loss": 0.3162, + "num_input_tokens_seen": 16153656, + "step": 24740 + }, + { + "epoch": 14.590212264150944, + "grad_norm": 3.0213730335235596, + "learning_rate": 2.068404975391077e-06, + "loss": 0.3303, + "num_input_tokens_seen": 16157464, + "step": 24745 + }, + { + "epoch": 14.59316037735849, + "grad_norm": 3.246367931365967, + "learning_rate": 2.0663212586494293e-06, + "loss": 0.372, + "num_input_tokens_seen": 16160824, + "step": 24750 + }, + { + "epoch": 14.596108490566039, + "grad_norm": 2.5751190185546875, + "learning_rate": 2.064238318611869e-06, + "loss": 0.2608, + "num_input_tokens_seen": 16167384, + "step": 24755 + }, + { + "epoch": 14.599056603773585, + "grad_norm": 1.9619578123092651, + "learning_rate": 2.0621561558298693e-06, + "loss": 0.2641, + "num_input_tokens_seen": 16171224, + "step": 24760 + }, + { + "epoch": 14.602004716981131, + "grad_norm": 3.241269588470459, + "learning_rate": 2.0600747708546877e-06, + "loss": 0.293, + "num_input_tokens_seen": 16174200, + "step": 24765 + }, + { + "epoch": 14.60495283018868, + "grad_norm": 2.0297579765319824, + "learning_rate": 2.0579941642373814e-06, + "loss": 0.2646, + "num_input_tokens_seen": 16177816, + "step": 24770 + }, + { + "epoch": 14.607900943396226, + "grad_norm": 3.2797138690948486, + "learning_rate": 2.0559143365287993e-06, + "loss": 0.4378, + "num_input_tokens_seen": 16180952, + "step": 24775 + }, + { + "epoch": 14.610849056603774, + "grad_norm": 3.4334628582000732, + "learning_rate": 2.0538352882795846e-06, + "loss": 0.3841, + "num_input_tokens_seen": 16184120, + "step": 24780 + }, + { + "epoch": 14.61379716981132, + "grad_norm": 5.2459211349487305, + "learning_rate": 2.051757020040173e-06, + "loss": 0.3101, + "num_input_tokens_seen": 16187000, + "step": 24785 + }, + { + "epoch": 14.616745283018869, + "grad_norm": 2.9022159576416016, + "learning_rate": 2.0496795323607983e-06, + "loss": 0.3104, + "num_input_tokens_seen": 16189720, + "step": 24790 + }, + { + "epoch": 14.619693396226415, + "grad_norm": 4.8147783279418945, + "learning_rate": 2.0476028257914825e-06, + "loss": 0.2769, + "num_input_tokens_seen": 16193112, + "step": 24795 + }, + { + "epoch": 14.622641509433961, + "grad_norm": 2.4804394245147705, + "learning_rate": 2.0455269008820433e-06, + "loss": 0.516, + "num_input_tokens_seen": 16197304, + "step": 24800 + }, + { + "epoch": 14.62558962264151, + "grad_norm": 3.421219825744629, + "learning_rate": 2.0434517581820893e-06, + "loss": 0.3636, + "num_input_tokens_seen": 16200728, + "step": 24805 + }, + { + "epoch": 14.628537735849056, + "grad_norm": 5.197613716125488, + "learning_rate": 2.041377398241025e-06, + "loss": 0.3165, + "num_input_tokens_seen": 16203448, + "step": 24810 + }, + { + "epoch": 14.631485849056604, + "grad_norm": 4.57867956161499, + "learning_rate": 2.0393038216080433e-06, + "loss": 0.3293, + "num_input_tokens_seen": 16206584, + "step": 24815 + }, + { + "epoch": 14.63443396226415, + "grad_norm": 16.227115631103516, + "learning_rate": 2.037231028832135e-06, + "loss": 0.5062, + "num_input_tokens_seen": 16208984, + "step": 24820 + }, + { + "epoch": 14.637382075471699, + "grad_norm": 3.1904683113098145, + "learning_rate": 2.0351590204620823e-06, + "loss": 0.3864, + "num_input_tokens_seen": 16211832, + "step": 24825 + }, + { + "epoch": 14.640330188679245, + "grad_norm": 4.230688095092773, + "learning_rate": 2.033087797046457e-06, + "loss": 0.3588, + "num_input_tokens_seen": 16215160, + "step": 24830 + }, + { + "epoch": 14.643278301886792, + "grad_norm": 2.4691450595855713, + "learning_rate": 2.031017359133624e-06, + "loss": 0.4272, + "num_input_tokens_seen": 16218904, + "step": 24835 + }, + { + "epoch": 14.64622641509434, + "grad_norm": 4.355981826782227, + "learning_rate": 2.0289477072717406e-06, + "loss": 0.2499, + "num_input_tokens_seen": 16222552, + "step": 24840 + }, + { + "epoch": 14.649174528301886, + "grad_norm": 4.029301643371582, + "learning_rate": 2.026878842008756e-06, + "loss": 0.2778, + "num_input_tokens_seen": 16225496, + "step": 24845 + }, + { + "epoch": 14.652122641509434, + "grad_norm": 3.060234308242798, + "learning_rate": 2.0248107638924105e-06, + "loss": 0.3617, + "num_input_tokens_seen": 16228728, + "step": 24850 + }, + { + "epoch": 14.65507075471698, + "grad_norm": 3.358344793319702, + "learning_rate": 2.0227434734702386e-06, + "loss": 0.3171, + "num_input_tokens_seen": 16231960, + "step": 24855 + }, + { + "epoch": 14.658018867924529, + "grad_norm": 2.5629827976226807, + "learning_rate": 2.020676971289563e-06, + "loss": 0.3514, + "num_input_tokens_seen": 16236088, + "step": 24860 + }, + { + "epoch": 14.660966981132075, + "grad_norm": 2.758998155593872, + "learning_rate": 2.0186112578975005e-06, + "loss": 0.3923, + "num_input_tokens_seen": 16241336, + "step": 24865 + }, + { + "epoch": 14.663915094339622, + "grad_norm": 3.4192328453063965, + "learning_rate": 2.016546333840956e-06, + "loss": 0.4374, + "num_input_tokens_seen": 16244088, + "step": 24870 + }, + { + "epoch": 14.66686320754717, + "grad_norm": 4.319182395935059, + "learning_rate": 2.014482199666627e-06, + "loss": 0.3159, + "num_input_tokens_seen": 16247768, + "step": 24875 + }, + { + "epoch": 14.669811320754716, + "grad_norm": 5.0578155517578125, + "learning_rate": 2.0124188559210017e-06, + "loss": 0.4157, + "num_input_tokens_seen": 16250552, + "step": 24880 + }, + { + "epoch": 14.672759433962264, + "grad_norm": 3.473557472229004, + "learning_rate": 2.0103563031503613e-06, + "loss": 0.3023, + "num_input_tokens_seen": 16253880, + "step": 24885 + }, + { + "epoch": 14.67570754716981, + "grad_norm": 3.932966470718384, + "learning_rate": 2.0082945419007745e-06, + "loss": 0.2616, + "num_input_tokens_seen": 16256952, + "step": 24890 + }, + { + "epoch": 14.678655660377359, + "grad_norm": 3.7742600440979004, + "learning_rate": 2.0062335727181007e-06, + "loss": 0.262, + "num_input_tokens_seen": 16261112, + "step": 24895 + }, + { + "epoch": 14.681603773584905, + "grad_norm": 4.404270648956299, + "learning_rate": 2.004173396147992e-06, + "loss": 0.2901, + "num_input_tokens_seen": 16263832, + "step": 24900 + }, + { + "epoch": 14.684551886792454, + "grad_norm": 3.263373613357544, + "learning_rate": 2.0021140127358873e-06, + "loss": 0.2831, + "num_input_tokens_seen": 16267320, + "step": 24905 + }, + { + "epoch": 14.6875, + "grad_norm": 7.989091396331787, + "learning_rate": 2.0000554230270164e-06, + "loss": 0.3113, + "num_input_tokens_seen": 16270296, + "step": 24910 + }, + { + "epoch": 14.690448113207546, + "grad_norm": 7.270644664764404, + "learning_rate": 1.997997627566401e-06, + "loss": 0.2193, + "num_input_tokens_seen": 16273144, + "step": 24915 + }, + { + "epoch": 14.693396226415095, + "grad_norm": 3.6785807609558105, + "learning_rate": 1.9959406268988536e-06, + "loss": 0.2916, + "num_input_tokens_seen": 16276344, + "step": 24920 + }, + { + "epoch": 14.696344339622641, + "grad_norm": 3.2129178047180176, + "learning_rate": 1.9938844215689717e-06, + "loss": 0.2714, + "num_input_tokens_seen": 16279544, + "step": 24925 + }, + { + "epoch": 14.69929245283019, + "grad_norm": 2.594600200653076, + "learning_rate": 1.991829012121145e-06, + "loss": 0.2246, + "num_input_tokens_seen": 16283128, + "step": 24930 + }, + { + "epoch": 14.702240566037736, + "grad_norm": 3.7569234371185303, + "learning_rate": 1.989774399099552e-06, + "loss": 0.3031, + "num_input_tokens_seen": 16285880, + "step": 24935 + }, + { + "epoch": 14.705188679245284, + "grad_norm": 7.433042049407959, + "learning_rate": 1.98772058304816e-06, + "loss": 0.3746, + "num_input_tokens_seen": 16288472, + "step": 24940 + }, + { + "epoch": 14.70813679245283, + "grad_norm": 5.312839508056641, + "learning_rate": 1.9856675645107244e-06, + "loss": 0.4514, + "num_input_tokens_seen": 16291256, + "step": 24945 + }, + { + "epoch": 14.711084905660378, + "grad_norm": 4.346865177154541, + "learning_rate": 1.9836153440307936e-06, + "loss": 0.2961, + "num_input_tokens_seen": 16294872, + "step": 24950 + }, + { + "epoch": 14.714033018867925, + "grad_norm": 5.7718281745910645, + "learning_rate": 1.9815639221517002e-06, + "loss": 0.269, + "num_input_tokens_seen": 16297688, + "step": 24955 + }, + { + "epoch": 14.716981132075471, + "grad_norm": 2.6061019897460938, + "learning_rate": 1.9795132994165673e-06, + "loss": 0.3559, + "num_input_tokens_seen": 16301208, + "step": 24960 + }, + { + "epoch": 14.71992924528302, + "grad_norm": 4.009645938873291, + "learning_rate": 1.977463476368306e-06, + "loss": 0.2998, + "num_input_tokens_seen": 16303736, + "step": 24965 + }, + { + "epoch": 14.722877358490566, + "grad_norm": 3.154634952545166, + "learning_rate": 1.975414453549614e-06, + "loss": 0.2754, + "num_input_tokens_seen": 16306616, + "step": 24970 + }, + { + "epoch": 14.725825471698114, + "grad_norm": 3.1322290897369385, + "learning_rate": 1.9733662315029826e-06, + "loss": 0.3963, + "num_input_tokens_seen": 16310040, + "step": 24975 + }, + { + "epoch": 14.72877358490566, + "grad_norm": 3.957031726837158, + "learning_rate": 1.9713188107706856e-06, + "loss": 0.3532, + "num_input_tokens_seen": 16312856, + "step": 24980 + }, + { + "epoch": 14.731721698113208, + "grad_norm": 2.846776008605957, + "learning_rate": 1.969272191894786e-06, + "loss": 0.2758, + "num_input_tokens_seen": 16315640, + "step": 24985 + }, + { + "epoch": 14.734669811320755, + "grad_norm": 3.068634271621704, + "learning_rate": 1.967226375417135e-06, + "loss": 0.297, + "num_input_tokens_seen": 16318680, + "step": 24990 + }, + { + "epoch": 14.737617924528301, + "grad_norm": 3.493964672088623, + "learning_rate": 1.965181361879372e-06, + "loss": 0.3126, + "num_input_tokens_seen": 16321848, + "step": 24995 + }, + { + "epoch": 14.74056603773585, + "grad_norm": 4.2359619140625, + "learning_rate": 1.9631371518229214e-06, + "loss": 0.2886, + "num_input_tokens_seen": 16324600, + "step": 25000 + }, + { + "epoch": 14.743514150943396, + "grad_norm": 2.6380057334899902, + "learning_rate": 1.9610937457889975e-06, + "loss": 0.2822, + "num_input_tokens_seen": 16327896, + "step": 25005 + }, + { + "epoch": 14.746462264150944, + "grad_norm": 4.02836275100708, + "learning_rate": 1.9590511443186032e-06, + "loss": 0.3042, + "num_input_tokens_seen": 16331096, + "step": 25010 + }, + { + "epoch": 14.74941037735849, + "grad_norm": 3.8817296028137207, + "learning_rate": 1.9570093479525243e-06, + "loss": 0.3561, + "num_input_tokens_seen": 16333976, + "step": 25015 + }, + { + "epoch": 14.752358490566039, + "grad_norm": 4.128470420837402, + "learning_rate": 1.954968357231335e-06, + "loss": 0.4548, + "num_input_tokens_seen": 16337176, + "step": 25020 + }, + { + "epoch": 14.755306603773585, + "grad_norm": 4.137679576873779, + "learning_rate": 1.9529281726953964e-06, + "loss": 0.3339, + "num_input_tokens_seen": 16340312, + "step": 25025 + }, + { + "epoch": 14.758254716981131, + "grad_norm": 7.715548515319824, + "learning_rate": 1.9508887948848564e-06, + "loss": 0.3169, + "num_input_tokens_seen": 16344728, + "step": 25030 + }, + { + "epoch": 14.76120283018868, + "grad_norm": 3.4236059188842773, + "learning_rate": 1.9488502243396475e-06, + "loss": 0.2614, + "num_input_tokens_seen": 16347832, + "step": 25035 + }, + { + "epoch": 14.764150943396226, + "grad_norm": 7.7521257400512695, + "learning_rate": 1.946812461599492e-06, + "loss": 0.2985, + "num_input_tokens_seen": 16350584, + "step": 25040 + }, + { + "epoch": 14.767099056603774, + "grad_norm": 3.282959222793579, + "learning_rate": 1.944775507203897e-06, + "loss": 0.2169, + "num_input_tokens_seen": 16354104, + "step": 25045 + }, + { + "epoch": 14.77004716981132, + "grad_norm": 2.107105016708374, + "learning_rate": 1.942739361692153e-06, + "loss": 0.2626, + "num_input_tokens_seen": 16357048, + "step": 25050 + }, + { + "epoch": 14.772995283018869, + "grad_norm": 4.1882524490356445, + "learning_rate": 1.94070402560334e-06, + "loss": 0.4593, + "num_input_tokens_seen": 16361048, + "step": 25055 + }, + { + "epoch": 14.775943396226415, + "grad_norm": 3.467486619949341, + "learning_rate": 1.93866949947632e-06, + "loss": 0.2883, + "num_input_tokens_seen": 16363864, + "step": 25060 + }, + { + "epoch": 14.778891509433961, + "grad_norm": 2.0222299098968506, + "learning_rate": 1.9366357838497423e-06, + "loss": 0.3646, + "num_input_tokens_seen": 16367608, + "step": 25065 + }, + { + "epoch": 14.78183962264151, + "grad_norm": 2.681932210922241, + "learning_rate": 1.9346028792620454e-06, + "loss": 0.4128, + "num_input_tokens_seen": 16371928, + "step": 25070 + }, + { + "epoch": 14.784787735849056, + "grad_norm": 3.1201298236846924, + "learning_rate": 1.9325707862514464e-06, + "loss": 0.3401, + "num_input_tokens_seen": 16375544, + "step": 25075 + }, + { + "epoch": 14.787735849056604, + "grad_norm": 2.4026060104370117, + "learning_rate": 1.930539505355952e-06, + "loss": 0.3298, + "num_input_tokens_seen": 16378648, + "step": 25080 + }, + { + "epoch": 14.79068396226415, + "grad_norm": 2.7981984615325928, + "learning_rate": 1.9285090371133524e-06, + "loss": 0.4165, + "num_input_tokens_seen": 16381880, + "step": 25085 + }, + { + "epoch": 14.793632075471699, + "grad_norm": 3.097672462463379, + "learning_rate": 1.9264793820612228e-06, + "loss": 0.4654, + "num_input_tokens_seen": 16386296, + "step": 25090 + }, + { + "epoch": 14.796580188679245, + "grad_norm": 2.9298319816589355, + "learning_rate": 1.924450540736921e-06, + "loss": 0.2619, + "num_input_tokens_seen": 16389272, + "step": 25095 + }, + { + "epoch": 14.799528301886792, + "grad_norm": 3.1842098236083984, + "learning_rate": 1.922422513677593e-06, + "loss": 0.3192, + "num_input_tokens_seen": 16392664, + "step": 25100 + }, + { + "epoch": 14.80247641509434, + "grad_norm": 2.408630132675171, + "learning_rate": 1.9203953014201703e-06, + "loss": 0.4344, + "num_input_tokens_seen": 16396536, + "step": 25105 + }, + { + "epoch": 14.805424528301886, + "grad_norm": 2.8250129222869873, + "learning_rate": 1.918368904501364e-06, + "loss": 0.2464, + "num_input_tokens_seen": 16400952, + "step": 25110 + }, + { + "epoch": 14.808372641509434, + "grad_norm": 4.00761604309082, + "learning_rate": 1.9163433234576713e-06, + "loss": 0.4455, + "num_input_tokens_seen": 16403864, + "step": 25115 + }, + { + "epoch": 14.81132075471698, + "grad_norm": 4.205718994140625, + "learning_rate": 1.9143185588253733e-06, + "loss": 0.3224, + "num_input_tokens_seen": 16406648, + "step": 25120 + }, + { + "epoch": 14.814268867924529, + "grad_norm": 2.6720190048217773, + "learning_rate": 1.9122946111405354e-06, + "loss": 0.3417, + "num_input_tokens_seen": 16409368, + "step": 25125 + }, + { + "epoch": 14.817216981132075, + "grad_norm": 4.602489948272705, + "learning_rate": 1.910271480939005e-06, + "loss": 0.2512, + "num_input_tokens_seen": 16411832, + "step": 25130 + }, + { + "epoch": 14.820165094339622, + "grad_norm": 4.8560791015625, + "learning_rate": 1.9082491687564176e-06, + "loss": 0.2818, + "num_input_tokens_seen": 16416504, + "step": 25135 + }, + { + "epoch": 14.82311320754717, + "grad_norm": 5.241753101348877, + "learning_rate": 1.9062276751281872e-06, + "loss": 0.3733, + "num_input_tokens_seen": 16419000, + "step": 25140 + }, + { + "epoch": 14.826061320754716, + "grad_norm": 1.7326551675796509, + "learning_rate": 1.9042070005895136e-06, + "loss": 0.4264, + "num_input_tokens_seen": 16423064, + "step": 25145 + }, + { + "epoch": 14.829009433962264, + "grad_norm": 4.018425464630127, + "learning_rate": 1.9021871456753788e-06, + "loss": 0.2879, + "num_input_tokens_seen": 16426200, + "step": 25150 + }, + { + "epoch": 14.83195754716981, + "grad_norm": 3.9319965839385986, + "learning_rate": 1.9001681109205478e-06, + "loss": 0.2652, + "num_input_tokens_seen": 16429048, + "step": 25155 + }, + { + "epoch": 14.834905660377359, + "grad_norm": 5.459967613220215, + "learning_rate": 1.898149896859567e-06, + "loss": 0.426, + "num_input_tokens_seen": 16431768, + "step": 25160 + }, + { + "epoch": 14.837853773584905, + "grad_norm": 1.9723478555679321, + "learning_rate": 1.8961325040267714e-06, + "loss": 0.2506, + "num_input_tokens_seen": 16434904, + "step": 25165 + }, + { + "epoch": 14.840801886792454, + "grad_norm": 3.8622987270355225, + "learning_rate": 1.894115932956272e-06, + "loss": 0.2935, + "num_input_tokens_seen": 16437912, + "step": 25170 + }, + { + "epoch": 14.84375, + "grad_norm": 3.6371262073516846, + "learning_rate": 1.8921001841819652e-06, + "loss": 0.3062, + "num_input_tokens_seen": 16441208, + "step": 25175 + }, + { + "epoch": 14.846698113207546, + "grad_norm": 7.932714939117432, + "learning_rate": 1.8900852582375284e-06, + "loss": 0.3655, + "num_input_tokens_seen": 16444568, + "step": 25180 + }, + { + "epoch": 14.849646226415095, + "grad_norm": 4.321752071380615, + "learning_rate": 1.8880711556564214e-06, + "loss": 0.2747, + "num_input_tokens_seen": 16448216, + "step": 25185 + }, + { + "epoch": 14.852594339622641, + "grad_norm": 3.3801183700561523, + "learning_rate": 1.8860578769718891e-06, + "loss": 0.3763, + "num_input_tokens_seen": 16450712, + "step": 25190 + }, + { + "epoch": 14.85554245283019, + "grad_norm": 5.338661193847656, + "learning_rate": 1.8840454227169525e-06, + "loss": 0.398, + "num_input_tokens_seen": 16453880, + "step": 25195 + }, + { + "epoch": 14.858490566037736, + "grad_norm": 6.127860069274902, + "learning_rate": 1.882033793424421e-06, + "loss": 0.3009, + "num_input_tokens_seen": 16456856, + "step": 25200 + }, + { + "epoch": 14.861438679245284, + "grad_norm": 2.8764519691467285, + "learning_rate": 1.88002298962688e-06, + "loss": 0.2878, + "num_input_tokens_seen": 16459704, + "step": 25205 + }, + { + "epoch": 14.86438679245283, + "grad_norm": 3.58858585357666, + "learning_rate": 1.8780130118566996e-06, + "loss": 0.3166, + "num_input_tokens_seen": 16462680, + "step": 25210 + }, + { + "epoch": 14.867334905660378, + "grad_norm": 2.39900279045105, + "learning_rate": 1.876003860646029e-06, + "loss": 0.2231, + "num_input_tokens_seen": 16465528, + "step": 25215 + }, + { + "epoch": 14.870283018867925, + "grad_norm": 4.4556379318237305, + "learning_rate": 1.8739955365267997e-06, + "loss": 0.319, + "num_input_tokens_seen": 16468280, + "step": 25220 + }, + { + "epoch": 14.873231132075471, + "grad_norm": 2.7291786670684814, + "learning_rate": 1.8719880400307228e-06, + "loss": 0.3685, + "num_input_tokens_seen": 16471864, + "step": 25225 + }, + { + "epoch": 14.87617924528302, + "grad_norm": 4.0243659019470215, + "learning_rate": 1.869981371689295e-06, + "loss": 0.336, + "num_input_tokens_seen": 16475992, + "step": 25230 + }, + { + "epoch": 14.879127358490566, + "grad_norm": 3.0323448181152344, + "learning_rate": 1.867975532033789e-06, + "loss": 0.3583, + "num_input_tokens_seen": 16480120, + "step": 25235 + }, + { + "epoch": 14.882075471698114, + "grad_norm": 1.9534499645233154, + "learning_rate": 1.8659705215952589e-06, + "loss": 0.301, + "num_input_tokens_seen": 16483512, + "step": 25240 + }, + { + "epoch": 14.88502358490566, + "grad_norm": 1.823805570602417, + "learning_rate": 1.8639663409045405e-06, + "loss": 0.3772, + "num_input_tokens_seen": 16486296, + "step": 25245 + }, + { + "epoch": 14.887971698113208, + "grad_norm": 2.388416290283203, + "learning_rate": 1.8619629904922466e-06, + "loss": 0.3067, + "num_input_tokens_seen": 16490008, + "step": 25250 + }, + { + "epoch": 14.890919811320755, + "grad_norm": 3.073296308517456, + "learning_rate": 1.859960470888777e-06, + "loss": 0.3912, + "num_input_tokens_seen": 16493112, + "step": 25255 + }, + { + "epoch": 14.893867924528301, + "grad_norm": 4.864343166351318, + "learning_rate": 1.857958782624306e-06, + "loss": 0.5101, + "num_input_tokens_seen": 16495640, + "step": 25260 + }, + { + "epoch": 14.89681603773585, + "grad_norm": 3.0970358848571777, + "learning_rate": 1.8559579262287886e-06, + "loss": 0.4065, + "num_input_tokens_seen": 16498040, + "step": 25265 + }, + { + "epoch": 14.899764150943396, + "grad_norm": 4.0576276779174805, + "learning_rate": 1.8539579022319599e-06, + "loss": 0.2834, + "num_input_tokens_seen": 16501240, + "step": 25270 + }, + { + "epoch": 14.902712264150944, + "grad_norm": 4.132444858551025, + "learning_rate": 1.8519587111633357e-06, + "loss": 0.3418, + "num_input_tokens_seen": 16503896, + "step": 25275 + }, + { + "epoch": 14.90566037735849, + "grad_norm": 4.599870681762695, + "learning_rate": 1.8499603535522082e-06, + "loss": 0.3834, + "num_input_tokens_seen": 16506296, + "step": 25280 + }, + { + "epoch": 14.908608490566039, + "grad_norm": 3.0442163944244385, + "learning_rate": 1.8479628299276543e-06, + "loss": 0.2442, + "num_input_tokens_seen": 16509336, + "step": 25285 + }, + { + "epoch": 14.911556603773585, + "grad_norm": 2.712388038635254, + "learning_rate": 1.8459661408185241e-06, + "loss": 0.2495, + "num_input_tokens_seen": 16512920, + "step": 25290 + }, + { + "epoch": 14.914504716981131, + "grad_norm": 6.849637508392334, + "learning_rate": 1.8439702867534536e-06, + "loss": 0.2916, + "num_input_tokens_seen": 16515704, + "step": 25295 + }, + { + "epoch": 14.91745283018868, + "grad_norm": 4.027571678161621, + "learning_rate": 1.841975268260851e-06, + "loss": 0.2867, + "num_input_tokens_seen": 16518360, + "step": 25300 + }, + { + "epoch": 14.920400943396226, + "grad_norm": 6.9468913078308105, + "learning_rate": 1.8399810858689066e-06, + "loss": 0.4697, + "num_input_tokens_seen": 16520888, + "step": 25305 + }, + { + "epoch": 14.923349056603774, + "grad_norm": 5.834597587585449, + "learning_rate": 1.8379877401055884e-06, + "loss": 0.278, + "num_input_tokens_seen": 16523480, + "step": 25310 + }, + { + "epoch": 14.92629716981132, + "grad_norm": 2.960740804672241, + "learning_rate": 1.8359952314986418e-06, + "loss": 0.4943, + "num_input_tokens_seen": 16526648, + "step": 25315 + }, + { + "epoch": 14.929245283018869, + "grad_norm": 3.694031000137329, + "learning_rate": 1.8340035605755957e-06, + "loss": 0.2614, + "num_input_tokens_seen": 16529688, + "step": 25320 + }, + { + "epoch": 14.932193396226415, + "grad_norm": 6.373254776000977, + "learning_rate": 1.8320127278637518e-06, + "loss": 0.3373, + "num_input_tokens_seen": 16533240, + "step": 25325 + }, + { + "epoch": 14.935141509433961, + "grad_norm": 6.575335502624512, + "learning_rate": 1.830022733890191e-06, + "loss": 0.4166, + "num_input_tokens_seen": 16536344, + "step": 25330 + }, + { + "epoch": 14.93808962264151, + "grad_norm": 6.3244123458862305, + "learning_rate": 1.8280335791817733e-06, + "loss": 0.2424, + "num_input_tokens_seen": 16539224, + "step": 25335 + }, + { + "epoch": 14.941037735849056, + "grad_norm": 2.961890459060669, + "learning_rate": 1.826045264265136e-06, + "loss": 0.2813, + "num_input_tokens_seen": 16543224, + "step": 25340 + }, + { + "epoch": 14.943985849056604, + "grad_norm": 5.144169807434082, + "learning_rate": 1.8240577896666928e-06, + "loss": 0.3727, + "num_input_tokens_seen": 16546744, + "step": 25345 + }, + { + "epoch": 14.94693396226415, + "grad_norm": 3.771613597869873, + "learning_rate": 1.8220711559126382e-06, + "loss": 0.3105, + "num_input_tokens_seen": 16549336, + "step": 25350 + }, + { + "epoch": 14.949882075471699, + "grad_norm": 3.303913116455078, + "learning_rate": 1.8200853635289417e-06, + "loss": 0.2301, + "num_input_tokens_seen": 16552248, + "step": 25355 + }, + { + "epoch": 14.952830188679245, + "grad_norm": 3.0048983097076416, + "learning_rate": 1.81810041304135e-06, + "loss": 0.2863, + "num_input_tokens_seen": 16555800, + "step": 25360 + }, + { + "epoch": 14.955778301886792, + "grad_norm": 4.158670425415039, + "learning_rate": 1.8161163049753865e-06, + "loss": 0.3241, + "num_input_tokens_seen": 16558904, + "step": 25365 + }, + { + "epoch": 14.95872641509434, + "grad_norm": 3.664205312728882, + "learning_rate": 1.8141330398563533e-06, + "loss": 0.3297, + "num_input_tokens_seen": 16561880, + "step": 25370 + }, + { + "epoch": 14.961674528301886, + "grad_norm": 3.264677047729492, + "learning_rate": 1.8121506182093268e-06, + "loss": 0.3352, + "num_input_tokens_seen": 16565176, + "step": 25375 + }, + { + "epoch": 14.964622641509434, + "grad_norm": 3.20595645904541, + "learning_rate": 1.8101690405591643e-06, + "loss": 0.3685, + "num_input_tokens_seen": 16567864, + "step": 25380 + }, + { + "epoch": 14.96757075471698, + "grad_norm": 1.9541380405426025, + "learning_rate": 1.8081883074304945e-06, + "loss": 0.2538, + "num_input_tokens_seen": 16571256, + "step": 25385 + }, + { + "epoch": 14.970518867924529, + "grad_norm": 1.7414108514785767, + "learning_rate": 1.8062084193477275e-06, + "loss": 0.2816, + "num_input_tokens_seen": 16575224, + "step": 25390 + }, + { + "epoch": 14.973466981132075, + "grad_norm": 4.276235580444336, + "learning_rate": 1.804229376835046e-06, + "loss": 0.2987, + "num_input_tokens_seen": 16580056, + "step": 25395 + }, + { + "epoch": 14.976415094339622, + "grad_norm": 2.6271822452545166, + "learning_rate": 1.8022511804164105e-06, + "loss": 0.3798, + "num_input_tokens_seen": 16583448, + "step": 25400 + }, + { + "epoch": 14.97936320754717, + "grad_norm": 2.8101799488067627, + "learning_rate": 1.8002738306155559e-06, + "loss": 0.3536, + "num_input_tokens_seen": 16586488, + "step": 25405 + }, + { + "epoch": 14.982311320754716, + "grad_norm": 6.60940408706665, + "learning_rate": 1.7982973279559935e-06, + "loss": 0.2914, + "num_input_tokens_seen": 16589272, + "step": 25410 + }, + { + "epoch": 14.985259433962264, + "grad_norm": 5.9747314453125, + "learning_rate": 1.7963216729610134e-06, + "loss": 0.2614, + "num_input_tokens_seen": 16593368, + "step": 25415 + }, + { + "epoch": 14.98820754716981, + "grad_norm": 3.487112045288086, + "learning_rate": 1.7943468661536773e-06, + "loss": 0.2487, + "num_input_tokens_seen": 16596056, + "step": 25420 + }, + { + "epoch": 14.991155660377359, + "grad_norm": 3.2089550495147705, + "learning_rate": 1.7923729080568242e-06, + "loss": 0.4671, + "num_input_tokens_seen": 16599192, + "step": 25425 + }, + { + "epoch": 14.994103773584905, + "grad_norm": 4.636631965637207, + "learning_rate": 1.7903997991930683e-06, + "loss": 0.3711, + "num_input_tokens_seen": 16602360, + "step": 25430 + }, + { + "epoch": 14.997051886792454, + "grad_norm": 3.2209982872009277, + "learning_rate": 1.7884275400847972e-06, + "loss": 0.2845, + "num_input_tokens_seen": 16605816, + "step": 25435 + }, + { + "epoch": 15.0, + "grad_norm": 9.4528226852417, + "learning_rate": 1.786456131254175e-06, + "loss": 0.2957, + "num_input_tokens_seen": 16608128, + "step": 25440 + }, + { + "epoch": 15.002948113207546, + "grad_norm": 4.454707145690918, + "learning_rate": 1.784485573223143e-06, + "loss": 0.3144, + "num_input_tokens_seen": 16611392, + "step": 25445 + }, + { + "epoch": 15.005896226415095, + "grad_norm": 2.239672899246216, + "learning_rate": 1.782515866513414e-06, + "loss": 0.3029, + "num_input_tokens_seen": 16614944, + "step": 25450 + }, + { + "epoch": 15.008844339622641, + "grad_norm": 2.878877878189087, + "learning_rate": 1.7805470116464758e-06, + "loss": 0.2815, + "num_input_tokens_seen": 16618240, + "step": 25455 + }, + { + "epoch": 15.01179245283019, + "grad_norm": 4.851008415222168, + "learning_rate": 1.7785790091435911e-06, + "loss": 0.29, + "num_input_tokens_seen": 16620832, + "step": 25460 + }, + { + "epoch": 15.014740566037736, + "grad_norm": 3.1711878776550293, + "learning_rate": 1.776611859525796e-06, + "loss": 0.3699, + "num_input_tokens_seen": 16623616, + "step": 25465 + }, + { + "epoch": 15.017688679245284, + "grad_norm": 3.8151462078094482, + "learning_rate": 1.7746455633139042e-06, + "loss": 0.3083, + "num_input_tokens_seen": 16626560, + "step": 25470 + }, + { + "epoch": 15.02063679245283, + "grad_norm": 4.4690632820129395, + "learning_rate": 1.7726801210285005e-06, + "loss": 0.4059, + "num_input_tokens_seen": 16629632, + "step": 25475 + }, + { + "epoch": 15.023584905660377, + "grad_norm": 2.7234582901000977, + "learning_rate": 1.7707155331899418e-06, + "loss": 0.3657, + "num_input_tokens_seen": 16632576, + "step": 25480 + }, + { + "epoch": 15.026533018867925, + "grad_norm": 4.251256465911865, + "learning_rate": 1.7687518003183645e-06, + "loss": 0.2627, + "num_input_tokens_seen": 16637632, + "step": 25485 + }, + { + "epoch": 15.029481132075471, + "grad_norm": 4.878654956817627, + "learning_rate": 1.766788922933675e-06, + "loss": 0.2495, + "num_input_tokens_seen": 16640192, + "step": 25490 + }, + { + "epoch": 15.03242924528302, + "grad_norm": 4.349775314331055, + "learning_rate": 1.7648269015555514e-06, + "loss": 0.3387, + "num_input_tokens_seen": 16643200, + "step": 25495 + }, + { + "epoch": 15.035377358490566, + "grad_norm": 2.7638635635375977, + "learning_rate": 1.7628657367034474e-06, + "loss": 0.2781, + "num_input_tokens_seen": 16647584, + "step": 25500 + }, + { + "epoch": 15.038325471698114, + "grad_norm": 4.539735317230225, + "learning_rate": 1.7609054288965922e-06, + "loss": 0.3434, + "num_input_tokens_seen": 16650592, + "step": 25505 + }, + { + "epoch": 15.04127358490566, + "grad_norm": 2.626659870147705, + "learning_rate": 1.7589459786539847e-06, + "loss": 0.2704, + "num_input_tokens_seen": 16653824, + "step": 25510 + }, + { + "epoch": 15.044221698113208, + "grad_norm": 3.5171616077423096, + "learning_rate": 1.7569873864943975e-06, + "loss": 0.3218, + "num_input_tokens_seen": 16658624, + "step": 25515 + }, + { + "epoch": 15.047169811320755, + "grad_norm": 3.7473433017730713, + "learning_rate": 1.7550296529363764e-06, + "loss": 0.2467, + "num_input_tokens_seen": 16662144, + "step": 25520 + }, + { + "epoch": 15.050117924528301, + "grad_norm": 1.9571799039840698, + "learning_rate": 1.7530727784982393e-06, + "loss": 0.3154, + "num_input_tokens_seen": 16666208, + "step": 25525 + }, + { + "epoch": 15.05306603773585, + "grad_norm": 3.209031820297241, + "learning_rate": 1.7511167636980765e-06, + "loss": 0.2863, + "num_input_tokens_seen": 16669728, + "step": 25530 + }, + { + "epoch": 15.056014150943396, + "grad_norm": 2.7215023040771484, + "learning_rate": 1.7491616090537539e-06, + "loss": 0.2888, + "num_input_tokens_seen": 16673120, + "step": 25535 + }, + { + "epoch": 15.058962264150944, + "grad_norm": 3.417344570159912, + "learning_rate": 1.7472073150829056e-06, + "loss": 0.3318, + "num_input_tokens_seen": 16678144, + "step": 25540 + }, + { + "epoch": 15.06191037735849, + "grad_norm": 4.313814163208008, + "learning_rate": 1.745253882302939e-06, + "loss": 0.3733, + "num_input_tokens_seen": 16681312, + "step": 25545 + }, + { + "epoch": 15.064858490566039, + "grad_norm": 3.0025970935821533, + "learning_rate": 1.743301311231035e-06, + "loss": 0.305, + "num_input_tokens_seen": 16685216, + "step": 25550 + }, + { + "epoch": 15.067806603773585, + "grad_norm": 4.73638916015625, + "learning_rate": 1.7413496023841437e-06, + "loss": 0.2366, + "num_input_tokens_seen": 16688736, + "step": 25555 + }, + { + "epoch": 15.070754716981131, + "grad_norm": 2.8894217014312744, + "learning_rate": 1.7393987562789876e-06, + "loss": 0.2316, + "num_input_tokens_seen": 16692000, + "step": 25560 + }, + { + "epoch": 15.07370283018868, + "grad_norm": 4.167801856994629, + "learning_rate": 1.7374487734320655e-06, + "loss": 0.3287, + "num_input_tokens_seen": 16694912, + "step": 25565 + }, + { + "epoch": 15.076650943396226, + "grad_norm": 3.0136146545410156, + "learning_rate": 1.7354996543596408e-06, + "loss": 0.2694, + "num_input_tokens_seen": 16698016, + "step": 25570 + }, + { + "epoch": 15.079599056603774, + "grad_norm": 2.9114134311676025, + "learning_rate": 1.7335513995777504e-06, + "loss": 0.2626, + "num_input_tokens_seen": 16701088, + "step": 25575 + }, + { + "epoch": 15.08254716981132, + "grad_norm": 3.192774772644043, + "learning_rate": 1.7316040096022062e-06, + "loss": 0.364, + "num_input_tokens_seen": 16703552, + "step": 25580 + }, + { + "epoch": 15.085495283018869, + "grad_norm": 4.4815673828125, + "learning_rate": 1.7296574849485863e-06, + "loss": 0.284, + "num_input_tokens_seen": 16706848, + "step": 25585 + }, + { + "epoch": 15.088443396226415, + "grad_norm": 4.820929527282715, + "learning_rate": 1.7277118261322423e-06, + "loss": 0.2834, + "num_input_tokens_seen": 16709664, + "step": 25590 + }, + { + "epoch": 15.091391509433961, + "grad_norm": 9.346332550048828, + "learning_rate": 1.7257670336682925e-06, + "loss": 0.2542, + "num_input_tokens_seen": 16712896, + "step": 25595 + }, + { + "epoch": 15.09433962264151, + "grad_norm": 5.682065486907959, + "learning_rate": 1.7238231080716339e-06, + "loss": 0.4803, + "num_input_tokens_seen": 16715680, + "step": 25600 + }, + { + "epoch": 15.097287735849056, + "grad_norm": 3.4871997833251953, + "learning_rate": 1.721880049856927e-06, + "loss": 0.3053, + "num_input_tokens_seen": 16718848, + "step": 25605 + }, + { + "epoch": 15.100235849056604, + "grad_norm": 7.554766654968262, + "learning_rate": 1.7199378595386046e-06, + "loss": 0.2962, + "num_input_tokens_seen": 16721600, + "step": 25610 + }, + { + "epoch": 15.10318396226415, + "grad_norm": 3.130730152130127, + "learning_rate": 1.7179965376308705e-06, + "loss": 0.2975, + "num_input_tokens_seen": 16724224, + "step": 25615 + }, + { + "epoch": 15.106132075471699, + "grad_norm": 3.5021908283233643, + "learning_rate": 1.7160560846476976e-06, + "loss": 0.2512, + "num_input_tokens_seen": 16728096, + "step": 25620 + }, + { + "epoch": 15.109080188679245, + "grad_norm": 2.9954609870910645, + "learning_rate": 1.7141165011028277e-06, + "loss": 0.2439, + "num_input_tokens_seen": 16731168, + "step": 25625 + }, + { + "epoch": 15.112028301886792, + "grad_norm": 2.9942305088043213, + "learning_rate": 1.7121777875097767e-06, + "loss": 0.2955, + "num_input_tokens_seen": 16734720, + "step": 25630 + }, + { + "epoch": 15.11497641509434, + "grad_norm": 3.5002331733703613, + "learning_rate": 1.7102399443818268e-06, + "loss": 0.2936, + "num_input_tokens_seen": 16738496, + "step": 25635 + }, + { + "epoch": 15.117924528301886, + "grad_norm": 4.55237340927124, + "learning_rate": 1.7083029722320294e-06, + "loss": 0.4011, + "num_input_tokens_seen": 16742848, + "step": 25640 + }, + { + "epoch": 15.120872641509434, + "grad_norm": 3.385624408721924, + "learning_rate": 1.7063668715732063e-06, + "loss": 0.3005, + "num_input_tokens_seen": 16746240, + "step": 25645 + }, + { + "epoch": 15.12382075471698, + "grad_norm": 2.2249433994293213, + "learning_rate": 1.7044316429179492e-06, + "loss": 0.3792, + "num_input_tokens_seen": 16750528, + "step": 25650 + }, + { + "epoch": 15.126768867924529, + "grad_norm": 2.7778255939483643, + "learning_rate": 1.7024972867786155e-06, + "loss": 0.2816, + "num_input_tokens_seen": 16755680, + "step": 25655 + }, + { + "epoch": 15.129716981132075, + "grad_norm": 2.397848606109619, + "learning_rate": 1.7005638036673389e-06, + "loss": 0.2923, + "num_input_tokens_seen": 16758432, + "step": 25660 + }, + { + "epoch": 15.132665094339623, + "grad_norm": 4.942976951599121, + "learning_rate": 1.6986311940960148e-06, + "loss": 0.3664, + "num_input_tokens_seen": 16760896, + "step": 25665 + }, + { + "epoch": 15.13561320754717, + "grad_norm": 5.957855224609375, + "learning_rate": 1.696699458576308e-06, + "loss": 0.3304, + "num_input_tokens_seen": 16764032, + "step": 25670 + }, + { + "epoch": 15.138561320754716, + "grad_norm": 4.301231384277344, + "learning_rate": 1.6947685976196581e-06, + "loss": 0.3979, + "num_input_tokens_seen": 16766912, + "step": 25675 + }, + { + "epoch": 15.141509433962264, + "grad_norm": 2.506430149078369, + "learning_rate": 1.692838611737267e-06, + "loss": 0.3337, + "num_input_tokens_seen": 16770304, + "step": 25680 + }, + { + "epoch": 15.14445754716981, + "grad_norm": 5.286166667938232, + "learning_rate": 1.690909501440106e-06, + "loss": 0.3284, + "num_input_tokens_seen": 16772768, + "step": 25685 + }, + { + "epoch": 15.147405660377359, + "grad_norm": 2.4631235599517822, + "learning_rate": 1.688981267238915e-06, + "loss": 0.2909, + "num_input_tokens_seen": 16775936, + "step": 25690 + }, + { + "epoch": 15.150353773584905, + "grad_norm": 6.819662570953369, + "learning_rate": 1.687053909644204e-06, + "loss": 0.3852, + "num_input_tokens_seen": 16778784, + "step": 25695 + }, + { + "epoch": 15.153301886792454, + "grad_norm": 5.26468563079834, + "learning_rate": 1.685127429166249e-06, + "loss": 0.3124, + "num_input_tokens_seen": 16782176, + "step": 25700 + }, + { + "epoch": 15.15625, + "grad_norm": 3.489720582962036, + "learning_rate": 1.683201826315093e-06, + "loss": 0.389, + "num_input_tokens_seen": 16785344, + "step": 25705 + }, + { + "epoch": 15.159198113207546, + "grad_norm": 3.3917508125305176, + "learning_rate": 1.681277101600548e-06, + "loss": 0.4336, + "num_input_tokens_seen": 16788032, + "step": 25710 + }, + { + "epoch": 15.162146226415095, + "grad_norm": 2.298631429672241, + "learning_rate": 1.6793532555321939e-06, + "loss": 0.3817, + "num_input_tokens_seen": 16791648, + "step": 25715 + }, + { + "epoch": 15.165094339622641, + "grad_norm": 3.780439853668213, + "learning_rate": 1.6774302886193744e-06, + "loss": 0.4318, + "num_input_tokens_seen": 16794368, + "step": 25720 + }, + { + "epoch": 15.16804245283019, + "grad_norm": 5.0596022605896, + "learning_rate": 1.6755082013712076e-06, + "loss": 0.3124, + "num_input_tokens_seen": 16796640, + "step": 25725 + }, + { + "epoch": 15.170990566037736, + "grad_norm": 4.355698108673096, + "learning_rate": 1.6735869942965716e-06, + "loss": 0.3717, + "num_input_tokens_seen": 16799616, + "step": 25730 + }, + { + "epoch": 15.173938679245284, + "grad_norm": 2.7480270862579346, + "learning_rate": 1.6716666679041155e-06, + "loss": 0.3702, + "num_input_tokens_seen": 16802784, + "step": 25735 + }, + { + "epoch": 15.17688679245283, + "grad_norm": 2.728898286819458, + "learning_rate": 1.6697472227022533e-06, + "loss": 0.2783, + "num_input_tokens_seen": 16805568, + "step": 25740 + }, + { + "epoch": 15.179834905660377, + "grad_norm": 2.5315985679626465, + "learning_rate": 1.6678286591991644e-06, + "loss": 0.2429, + "num_input_tokens_seen": 16809312, + "step": 25745 + }, + { + "epoch": 15.182783018867925, + "grad_norm": 8.58702564239502, + "learning_rate": 1.665910977902801e-06, + "loss": 0.4033, + "num_input_tokens_seen": 16811328, + "step": 25750 + }, + { + "epoch": 15.185731132075471, + "grad_norm": 3.6224279403686523, + "learning_rate": 1.6639941793208747e-06, + "loss": 0.357, + "num_input_tokens_seen": 16814144, + "step": 25755 + }, + { + "epoch": 15.18867924528302, + "grad_norm": 2.36954402923584, + "learning_rate": 1.6620782639608674e-06, + "loss": 0.2642, + "num_input_tokens_seen": 16817248, + "step": 25760 + }, + { + "epoch": 15.191627358490566, + "grad_norm": 2.8240365982055664, + "learning_rate": 1.6601632323300231e-06, + "loss": 0.3698, + "num_input_tokens_seen": 16820800, + "step": 25765 + }, + { + "epoch": 15.194575471698114, + "grad_norm": 3.197333574295044, + "learning_rate": 1.6582490849353595e-06, + "loss": 0.3494, + "num_input_tokens_seen": 16823872, + "step": 25770 + }, + { + "epoch": 15.19752358490566, + "grad_norm": 5.338634490966797, + "learning_rate": 1.6563358222836523e-06, + "loss": 0.4361, + "num_input_tokens_seen": 16826560, + "step": 25775 + }, + { + "epoch": 15.200471698113208, + "grad_norm": 3.3730363845825195, + "learning_rate": 1.654423444881445e-06, + "loss": 0.353, + "num_input_tokens_seen": 16829920, + "step": 25780 + }, + { + "epoch": 15.203419811320755, + "grad_norm": 6.791359901428223, + "learning_rate": 1.652511953235051e-06, + "loss": 0.4059, + "num_input_tokens_seen": 16833280, + "step": 25785 + }, + { + "epoch": 15.206367924528301, + "grad_norm": 4.881641864776611, + "learning_rate": 1.650601347850544e-06, + "loss": 0.2628, + "num_input_tokens_seen": 16837088, + "step": 25790 + }, + { + "epoch": 15.20931603773585, + "grad_norm": 5.139157772064209, + "learning_rate": 1.6486916292337652e-06, + "loss": 0.2907, + "num_input_tokens_seen": 16841024, + "step": 25795 + }, + { + "epoch": 15.212264150943396, + "grad_norm": 4.052705764770508, + "learning_rate": 1.6467827978903212e-06, + "loss": 0.285, + "num_input_tokens_seen": 16843648, + "step": 25800 + }, + { + "epoch": 15.215212264150944, + "grad_norm": 2.7996513843536377, + "learning_rate": 1.6448748543255827e-06, + "loss": 0.2227, + "num_input_tokens_seen": 16846528, + "step": 25805 + }, + { + "epoch": 15.21816037735849, + "grad_norm": 5.10759973526001, + "learning_rate": 1.6429677990446845e-06, + "loss": 0.2729, + "num_input_tokens_seen": 16849152, + "step": 25810 + }, + { + "epoch": 15.221108490566039, + "grad_norm": 4.511544704437256, + "learning_rate": 1.6410616325525319e-06, + "loss": 0.2593, + "num_input_tokens_seen": 16853312, + "step": 25815 + }, + { + "epoch": 15.224056603773585, + "grad_norm": 3.1804723739624023, + "learning_rate": 1.6391563553537875e-06, + "loss": 0.2888, + "num_input_tokens_seen": 16855968, + "step": 25820 + }, + { + "epoch": 15.227004716981131, + "grad_norm": 2.1584951877593994, + "learning_rate": 1.6372519679528832e-06, + "loss": 0.2297, + "num_input_tokens_seen": 16858816, + "step": 25825 + }, + { + "epoch": 15.22995283018868, + "grad_norm": 3.4638116359710693, + "learning_rate": 1.6353484708540124e-06, + "loss": 0.284, + "num_input_tokens_seen": 16862240, + "step": 25830 + }, + { + "epoch": 15.232900943396226, + "grad_norm": 3.020129919052124, + "learning_rate": 1.633445864561135e-06, + "loss": 0.216, + "num_input_tokens_seen": 16865312, + "step": 25835 + }, + { + "epoch": 15.235849056603774, + "grad_norm": 3.6519789695739746, + "learning_rate": 1.6315441495779726e-06, + "loss": 0.3221, + "num_input_tokens_seen": 16867936, + "step": 25840 + }, + { + "epoch": 15.23879716981132, + "grad_norm": 2.9586479663848877, + "learning_rate": 1.6296433264080152e-06, + "loss": 0.3484, + "num_input_tokens_seen": 16871968, + "step": 25845 + }, + { + "epoch": 15.241745283018869, + "grad_norm": 3.5766379833221436, + "learning_rate": 1.627743395554513e-06, + "loss": 0.2495, + "num_input_tokens_seen": 16874464, + "step": 25850 + }, + { + "epoch": 15.244693396226415, + "grad_norm": 3.1976966857910156, + "learning_rate": 1.6258443575204802e-06, + "loss": 0.3398, + "num_input_tokens_seen": 16878304, + "step": 25855 + }, + { + "epoch": 15.247641509433961, + "grad_norm": 4.662925720214844, + "learning_rate": 1.6239462128086936e-06, + "loss": 0.4084, + "num_input_tokens_seen": 16880992, + "step": 25860 + }, + { + "epoch": 15.25058962264151, + "grad_norm": 6.328319072723389, + "learning_rate": 1.6220489619216988e-06, + "loss": 0.305, + "num_input_tokens_seen": 16884416, + "step": 25865 + }, + { + "epoch": 15.253537735849056, + "grad_norm": 3.5702531337738037, + "learning_rate": 1.6201526053618e-06, + "loss": 0.3347, + "num_input_tokens_seen": 16890112, + "step": 25870 + }, + { + "epoch": 15.256485849056604, + "grad_norm": 2.707454204559326, + "learning_rate": 1.6182571436310634e-06, + "loss": 0.2463, + "num_input_tokens_seen": 16893792, + "step": 25875 + }, + { + "epoch": 15.25943396226415, + "grad_norm": 2.448408365249634, + "learning_rate": 1.616362577231324e-06, + "loss": 0.2625, + "num_input_tokens_seen": 16897376, + "step": 25880 + }, + { + "epoch": 15.262382075471699, + "grad_norm": 3.3578197956085205, + "learning_rate": 1.614468906664175e-06, + "loss": 0.2669, + "num_input_tokens_seen": 16899968, + "step": 25885 + }, + { + "epoch": 15.265330188679245, + "grad_norm": 2.9495012760162354, + "learning_rate": 1.612576132430974e-06, + "loss": 0.4096, + "num_input_tokens_seen": 16902944, + "step": 25890 + }, + { + "epoch": 15.268278301886792, + "grad_norm": 6.448451995849609, + "learning_rate": 1.6106842550328406e-06, + "loss": 0.2346, + "num_input_tokens_seen": 16905664, + "step": 25895 + }, + { + "epoch": 15.27122641509434, + "grad_norm": 2.8032243251800537, + "learning_rate": 1.6087932749706582e-06, + "loss": 0.2025, + "num_input_tokens_seen": 16912096, + "step": 25900 + }, + { + "epoch": 15.274174528301886, + "grad_norm": 3.229433059692383, + "learning_rate": 1.6069031927450696e-06, + "loss": 0.2872, + "num_input_tokens_seen": 16915616, + "step": 25905 + }, + { + "epoch": 15.277122641509434, + "grad_norm": 2.603994607925415, + "learning_rate": 1.605014008856486e-06, + "loss": 0.2822, + "num_input_tokens_seen": 16919008, + "step": 25910 + }, + { + "epoch": 15.28007075471698, + "grad_norm": 3.950531005859375, + "learning_rate": 1.6031257238050745e-06, + "loss": 0.4182, + "num_input_tokens_seen": 16921568, + "step": 25915 + }, + { + "epoch": 15.283018867924529, + "grad_norm": 2.3392112255096436, + "learning_rate": 1.601238338090768e-06, + "loss": 0.238, + "num_input_tokens_seen": 16924128, + "step": 25920 + }, + { + "epoch": 15.285966981132075, + "grad_norm": 2.9155807495117188, + "learning_rate": 1.5993518522132595e-06, + "loss": 0.2561, + "num_input_tokens_seen": 16927776, + "step": 25925 + }, + { + "epoch": 15.288915094339623, + "grad_norm": 4.762979030609131, + "learning_rate": 1.5974662666720037e-06, + "loss": 0.3429, + "num_input_tokens_seen": 16930304, + "step": 25930 + }, + { + "epoch": 15.29186320754717, + "grad_norm": 3.616849184036255, + "learning_rate": 1.5955815819662162e-06, + "loss": 0.2773, + "num_input_tokens_seen": 16933920, + "step": 25935 + }, + { + "epoch": 15.294811320754716, + "grad_norm": 1.9612195491790771, + "learning_rate": 1.5936977985948788e-06, + "loss": 0.2703, + "num_input_tokens_seen": 16937280, + "step": 25940 + }, + { + "epoch": 15.297759433962264, + "grad_norm": 5.351400852203369, + "learning_rate": 1.5918149170567298e-06, + "loss": 0.3939, + "num_input_tokens_seen": 16939648, + "step": 25945 + }, + { + "epoch": 15.30070754716981, + "grad_norm": 3.1368401050567627, + "learning_rate": 1.5899329378502698e-06, + "loss": 0.2569, + "num_input_tokens_seen": 16943296, + "step": 25950 + }, + { + "epoch": 15.303655660377359, + "grad_norm": 2.486192226409912, + "learning_rate": 1.588051861473761e-06, + "loss": 0.2147, + "num_input_tokens_seen": 16946016, + "step": 25955 + }, + { + "epoch": 15.306603773584905, + "grad_norm": 5.314333915710449, + "learning_rate": 1.5861716884252253e-06, + "loss": 0.3417, + "num_input_tokens_seen": 16949248, + "step": 25960 + }, + { + "epoch": 15.309551886792454, + "grad_norm": 4.597508430480957, + "learning_rate": 1.5842924192024489e-06, + "loss": 0.2476, + "num_input_tokens_seen": 16951968, + "step": 25965 + }, + { + "epoch": 15.3125, + "grad_norm": 3.496854305267334, + "learning_rate": 1.5824140543029742e-06, + "loss": 0.431, + "num_input_tokens_seen": 16955360, + "step": 25970 + }, + { + "epoch": 15.315448113207546, + "grad_norm": 4.190262317657471, + "learning_rate": 1.5805365942241092e-06, + "loss": 0.3109, + "num_input_tokens_seen": 16959040, + "step": 25975 + }, + { + "epoch": 15.318396226415095, + "grad_norm": 3.240661859512329, + "learning_rate": 1.5786600394629181e-06, + "loss": 0.4035, + "num_input_tokens_seen": 16963456, + "step": 25980 + }, + { + "epoch": 15.321344339622641, + "grad_norm": 3.06575345993042, + "learning_rate": 1.5767843905162261e-06, + "loss": 0.315, + "num_input_tokens_seen": 16966528, + "step": 25985 + }, + { + "epoch": 15.32429245283019, + "grad_norm": 3.030884027481079, + "learning_rate": 1.5749096478806209e-06, + "loss": 0.3156, + "num_input_tokens_seen": 16969824, + "step": 25990 + }, + { + "epoch": 15.327240566037736, + "grad_norm": 2.5016205310821533, + "learning_rate": 1.5730358120524452e-06, + "loss": 0.2571, + "num_input_tokens_seen": 16973536, + "step": 25995 + }, + { + "epoch": 15.330188679245284, + "grad_norm": 3.1649436950683594, + "learning_rate": 1.5711628835278098e-06, + "loss": 0.278, + "num_input_tokens_seen": 16975808, + "step": 26000 + }, + { + "epoch": 15.33313679245283, + "grad_norm": 2.832500696182251, + "learning_rate": 1.5692908628025782e-06, + "loss": 0.2894, + "num_input_tokens_seen": 16979200, + "step": 26005 + }, + { + "epoch": 15.336084905660377, + "grad_norm": 3.2382915019989014, + "learning_rate": 1.5674197503723765e-06, + "loss": 0.3037, + "num_input_tokens_seen": 16982048, + "step": 26010 + }, + { + "epoch": 15.339033018867925, + "grad_norm": 4.457133769989014, + "learning_rate": 1.5655495467325893e-06, + "loss": 0.2992, + "num_input_tokens_seen": 16984608, + "step": 26015 + }, + { + "epoch": 15.341981132075471, + "grad_norm": 4.746394157409668, + "learning_rate": 1.5636802523783613e-06, + "loss": 0.2037, + "num_input_tokens_seen": 16987456, + "step": 26020 + }, + { + "epoch": 15.34492924528302, + "grad_norm": 3.240077018737793, + "learning_rate": 1.5618118678045947e-06, + "loss": 0.3017, + "num_input_tokens_seen": 16990048, + "step": 26025 + }, + { + "epoch": 15.347877358490566, + "grad_norm": 3.805908441543579, + "learning_rate": 1.5599443935059549e-06, + "loss": 0.3415, + "num_input_tokens_seen": 16993024, + "step": 26030 + }, + { + "epoch": 15.350825471698114, + "grad_norm": 3.686028242111206, + "learning_rate": 1.5580778299768635e-06, + "loss": 0.2772, + "num_input_tokens_seen": 16995552, + "step": 26035 + }, + { + "epoch": 15.35377358490566, + "grad_norm": 4.051374912261963, + "learning_rate": 1.5562121777114997e-06, + "loss": 0.3273, + "num_input_tokens_seen": 16998848, + "step": 26040 + }, + { + "epoch": 15.356721698113208, + "grad_norm": 3.596458911895752, + "learning_rate": 1.5543474372038043e-06, + "loss": 0.3127, + "num_input_tokens_seen": 17001792, + "step": 26045 + }, + { + "epoch": 15.359669811320755, + "grad_norm": 4.27158784866333, + "learning_rate": 1.5524836089474748e-06, + "loss": 0.3338, + "num_input_tokens_seen": 17004576, + "step": 26050 + }, + { + "epoch": 15.362617924528301, + "grad_norm": 3.6558728218078613, + "learning_rate": 1.5506206934359664e-06, + "loss": 0.3955, + "num_input_tokens_seen": 17008736, + "step": 26055 + }, + { + "epoch": 15.36556603773585, + "grad_norm": 4.7534332275390625, + "learning_rate": 1.5487586911624947e-06, + "loss": 0.2797, + "num_input_tokens_seen": 17012128, + "step": 26060 + }, + { + "epoch": 15.368514150943396, + "grad_norm": 6.403425693511963, + "learning_rate": 1.5468976026200355e-06, + "loss": 0.3364, + "num_input_tokens_seen": 17015584, + "step": 26065 + }, + { + "epoch": 15.371462264150944, + "grad_norm": 4.322321891784668, + "learning_rate": 1.5450374283013187e-06, + "loss": 0.3649, + "num_input_tokens_seen": 17019456, + "step": 26070 + }, + { + "epoch": 15.37441037735849, + "grad_norm": 3.8460400104522705, + "learning_rate": 1.5431781686988317e-06, + "loss": 0.2437, + "num_input_tokens_seen": 17022272, + "step": 26075 + }, + { + "epoch": 15.377358490566039, + "grad_norm": 3.451488494873047, + "learning_rate": 1.5413198243048233e-06, + "loss": 0.3765, + "num_input_tokens_seen": 17025856, + "step": 26080 + }, + { + "epoch": 15.380306603773585, + "grad_norm": 3.2928130626678467, + "learning_rate": 1.5394623956112974e-06, + "loss": 0.3269, + "num_input_tokens_seen": 17029024, + "step": 26085 + }, + { + "epoch": 15.383254716981131, + "grad_norm": 3.556833505630493, + "learning_rate": 1.537605883110015e-06, + "loss": 0.4038, + "num_input_tokens_seen": 17033056, + "step": 26090 + }, + { + "epoch": 15.38620283018868, + "grad_norm": 4.7401814460754395, + "learning_rate": 1.5357502872924984e-06, + "loss": 0.361, + "num_input_tokens_seen": 17035488, + "step": 26095 + }, + { + "epoch": 15.389150943396226, + "grad_norm": 3.3929638862609863, + "learning_rate": 1.5338956086500235e-06, + "loss": 0.2865, + "num_input_tokens_seen": 17039296, + "step": 26100 + }, + { + "epoch": 15.392099056603774, + "grad_norm": 3.584487199783325, + "learning_rate": 1.5320418476736237e-06, + "loss": 0.3838, + "num_input_tokens_seen": 17043104, + "step": 26105 + }, + { + "epoch": 15.39504716981132, + "grad_norm": 4.780518054962158, + "learning_rate": 1.5301890048540912e-06, + "loss": 0.2337, + "num_input_tokens_seen": 17046080, + "step": 26110 + }, + { + "epoch": 15.397995283018869, + "grad_norm": 4.179786205291748, + "learning_rate": 1.5283370806819743e-06, + "loss": 0.3338, + "num_input_tokens_seen": 17048768, + "step": 26115 + }, + { + "epoch": 15.400943396226415, + "grad_norm": 3.493394374847412, + "learning_rate": 1.5264860756475752e-06, + "loss": 0.2568, + "num_input_tokens_seen": 17051680, + "step": 26120 + }, + { + "epoch": 15.403891509433961, + "grad_norm": 2.6219229698181152, + "learning_rate": 1.5246359902409592e-06, + "loss": 0.2933, + "num_input_tokens_seen": 17054944, + "step": 26125 + }, + { + "epoch": 15.40683962264151, + "grad_norm": 2.644037961959839, + "learning_rate": 1.5227868249519423e-06, + "loss": 0.3394, + "num_input_tokens_seen": 17058048, + "step": 26130 + }, + { + "epoch": 15.409787735849056, + "grad_norm": 2.4150314331054688, + "learning_rate": 1.5209385802700999e-06, + "loss": 0.2944, + "num_input_tokens_seen": 17060960, + "step": 26135 + }, + { + "epoch": 15.412735849056604, + "grad_norm": 2.8627138137817383, + "learning_rate": 1.5190912566847626e-06, + "loss": 0.2966, + "num_input_tokens_seen": 17064992, + "step": 26140 + }, + { + "epoch": 15.41568396226415, + "grad_norm": 5.699700832366943, + "learning_rate": 1.5172448546850166e-06, + "loss": 0.2344, + "num_input_tokens_seen": 17067936, + "step": 26145 + }, + { + "epoch": 15.418632075471699, + "grad_norm": 3.5310146808624268, + "learning_rate": 1.515399374759704e-06, + "loss": 0.2554, + "num_input_tokens_seen": 17071168, + "step": 26150 + }, + { + "epoch": 15.421580188679245, + "grad_norm": 5.196860313415527, + "learning_rate": 1.513554817397424e-06, + "loss": 0.282, + "num_input_tokens_seen": 17074560, + "step": 26155 + }, + { + "epoch": 15.424528301886792, + "grad_norm": 3.084752321243286, + "learning_rate": 1.5117111830865338e-06, + "loss": 0.3104, + "num_input_tokens_seen": 17077312, + "step": 26160 + }, + { + "epoch": 15.42747641509434, + "grad_norm": 3.71352481842041, + "learning_rate": 1.509868472315142e-06, + "loss": 0.2123, + "num_input_tokens_seen": 17080608, + "step": 26165 + }, + { + "epoch": 15.430424528301886, + "grad_norm": 2.066924810409546, + "learning_rate": 1.508026685571113e-06, + "loss": 0.2791, + "num_input_tokens_seen": 17083072, + "step": 26170 + }, + { + "epoch": 15.433372641509434, + "grad_norm": 3.460465669631958, + "learning_rate": 1.506185823342069e-06, + "loss": 0.2194, + "num_input_tokens_seen": 17086208, + "step": 26175 + }, + { + "epoch": 15.43632075471698, + "grad_norm": 2.629861831665039, + "learning_rate": 1.504345886115386e-06, + "loss": 0.3421, + "num_input_tokens_seen": 17089312, + "step": 26180 + }, + { + "epoch": 15.439268867924529, + "grad_norm": 3.4712040424346924, + "learning_rate": 1.502506874378193e-06, + "loss": 0.3095, + "num_input_tokens_seen": 17092576, + "step": 26185 + }, + { + "epoch": 15.442216981132075, + "grad_norm": 4.7518463134765625, + "learning_rate": 1.5006687886173805e-06, + "loss": 0.4259, + "num_input_tokens_seen": 17096896, + "step": 26190 + }, + { + "epoch": 15.445165094339623, + "grad_norm": 2.8150784969329834, + "learning_rate": 1.498831629319587e-06, + "loss": 0.2983, + "num_input_tokens_seen": 17100320, + "step": 26195 + }, + { + "epoch": 15.44811320754717, + "grad_norm": 3.150428533554077, + "learning_rate": 1.4969953969712087e-06, + "loss": 0.3975, + "num_input_tokens_seen": 17103200, + "step": 26200 + }, + { + "epoch": 15.451061320754716, + "grad_norm": 6.3504252433776855, + "learning_rate": 1.4951600920583963e-06, + "loss": 0.3394, + "num_input_tokens_seen": 17106336, + "step": 26205 + }, + { + "epoch": 15.454009433962264, + "grad_norm": 3.351583242416382, + "learning_rate": 1.493325715067055e-06, + "loss": 0.4025, + "num_input_tokens_seen": 17109728, + "step": 26210 + }, + { + "epoch": 15.45695754716981, + "grad_norm": 4.036856651306152, + "learning_rate": 1.4914922664828417e-06, + "loss": 0.3181, + "num_input_tokens_seen": 17113152, + "step": 26215 + }, + { + "epoch": 15.459905660377359, + "grad_norm": 4.360968589782715, + "learning_rate": 1.4896597467911732e-06, + "loss": 0.342, + "num_input_tokens_seen": 17115904, + "step": 26220 + }, + { + "epoch": 15.462853773584905, + "grad_norm": 2.939380645751953, + "learning_rate": 1.4878281564772156e-06, + "loss": 0.2785, + "num_input_tokens_seen": 17118752, + "step": 26225 + }, + { + "epoch": 15.465801886792454, + "grad_norm": 5.1072211265563965, + "learning_rate": 1.4859974960258898e-06, + "loss": 0.2652, + "num_input_tokens_seen": 17121888, + "step": 26230 + }, + { + "epoch": 15.46875, + "grad_norm": 3.8873965740203857, + "learning_rate": 1.4841677659218723e-06, + "loss": 0.349, + "num_input_tokens_seen": 17124320, + "step": 26235 + }, + { + "epoch": 15.471698113207546, + "grad_norm": 4.178337097167969, + "learning_rate": 1.4823389666495886e-06, + "loss": 0.3724, + "num_input_tokens_seen": 17128480, + "step": 26240 + }, + { + "epoch": 15.474646226415095, + "grad_norm": 4.621440887451172, + "learning_rate": 1.4805110986932258e-06, + "loss": 0.3636, + "num_input_tokens_seen": 17132160, + "step": 26245 + }, + { + "epoch": 15.477594339622641, + "grad_norm": 3.8071951866149902, + "learning_rate": 1.4786841625367166e-06, + "loss": 0.2759, + "num_input_tokens_seen": 17135392, + "step": 26250 + }, + { + "epoch": 15.48054245283019, + "grad_norm": 3.615647792816162, + "learning_rate": 1.476858158663752e-06, + "loss": 0.3144, + "num_input_tokens_seen": 17139232, + "step": 26255 + }, + { + "epoch": 15.483490566037736, + "grad_norm": 3.2508201599121094, + "learning_rate": 1.4750330875577745e-06, + "loss": 0.3538, + "num_input_tokens_seen": 17142976, + "step": 26260 + }, + { + "epoch": 15.486438679245284, + "grad_norm": 4.152102947235107, + "learning_rate": 1.4732089497019787e-06, + "loss": 0.2685, + "num_input_tokens_seen": 17145824, + "step": 26265 + }, + { + "epoch": 15.48938679245283, + "grad_norm": 2.2849650382995605, + "learning_rate": 1.471385745579313e-06, + "loss": 0.2811, + "num_input_tokens_seen": 17148320, + "step": 26270 + }, + { + "epoch": 15.492334905660377, + "grad_norm": 2.4169235229492188, + "learning_rate": 1.4695634756724775e-06, + "loss": 0.2491, + "num_input_tokens_seen": 17151584, + "step": 26275 + }, + { + "epoch": 15.495283018867925, + "grad_norm": 3.553770065307617, + "learning_rate": 1.4677421404639281e-06, + "loss": 0.3467, + "num_input_tokens_seen": 17154496, + "step": 26280 + }, + { + "epoch": 15.498231132075471, + "grad_norm": 4.313138484954834, + "learning_rate": 1.4659217404358706e-06, + "loss": 0.313, + "num_input_tokens_seen": 17157376, + "step": 26285 + }, + { + "epoch": 15.50117924528302, + "grad_norm": 3.95273494720459, + "learning_rate": 1.4641022760702627e-06, + "loss": 0.4681, + "num_input_tokens_seen": 17161792, + "step": 26290 + }, + { + "epoch": 15.504127358490566, + "grad_norm": 4.109541893005371, + "learning_rate": 1.4622837478488172e-06, + "loss": 0.2548, + "num_input_tokens_seen": 17164992, + "step": 26295 + }, + { + "epoch": 15.507075471698114, + "grad_norm": 3.1863913536071777, + "learning_rate": 1.4604661562529953e-06, + "loss": 0.422, + "num_input_tokens_seen": 17168672, + "step": 26300 + }, + { + "epoch": 15.51002358490566, + "grad_norm": 2.9035253524780273, + "learning_rate": 1.4586495017640119e-06, + "loss": 0.3129, + "num_input_tokens_seen": 17171232, + "step": 26305 + }, + { + "epoch": 15.512971698113208, + "grad_norm": 5.44696044921875, + "learning_rate": 1.4568337848628366e-06, + "loss": 0.3671, + "num_input_tokens_seen": 17174080, + "step": 26310 + }, + { + "epoch": 15.515919811320755, + "grad_norm": 4.756264686584473, + "learning_rate": 1.4550190060301872e-06, + "loss": 0.271, + "num_input_tokens_seen": 17177728, + "step": 26315 + }, + { + "epoch": 15.518867924528301, + "grad_norm": 3.962956190109253, + "learning_rate": 1.4532051657465335e-06, + "loss": 0.2997, + "num_input_tokens_seen": 17180768, + "step": 26320 + }, + { + "epoch": 15.52181603773585, + "grad_norm": 4.2770795822143555, + "learning_rate": 1.4513922644920985e-06, + "loss": 0.3333, + "num_input_tokens_seen": 17183200, + "step": 26325 + }, + { + "epoch": 15.524764150943396, + "grad_norm": 4.2647809982299805, + "learning_rate": 1.4495803027468552e-06, + "loss": 0.2526, + "num_input_tokens_seen": 17185888, + "step": 26330 + }, + { + "epoch": 15.527712264150944, + "grad_norm": 3.7605979442596436, + "learning_rate": 1.4477692809905263e-06, + "loss": 0.2904, + "num_input_tokens_seen": 17188960, + "step": 26335 + }, + { + "epoch": 15.53066037735849, + "grad_norm": 3.663928508758545, + "learning_rate": 1.4459591997025896e-06, + "loss": 0.3754, + "num_input_tokens_seen": 17193024, + "step": 26340 + }, + { + "epoch": 15.533608490566039, + "grad_norm": 4.425746917724609, + "learning_rate": 1.4441500593622737e-06, + "loss": 0.3195, + "num_input_tokens_seen": 17196672, + "step": 26345 + }, + { + "epoch": 15.536556603773585, + "grad_norm": 5.2296857833862305, + "learning_rate": 1.4423418604485539e-06, + "loss": 0.3815, + "num_input_tokens_seen": 17200192, + "step": 26350 + }, + { + "epoch": 15.539504716981131, + "grad_norm": 3.1150588989257812, + "learning_rate": 1.4405346034401597e-06, + "loss": 0.3798, + "num_input_tokens_seen": 17203616, + "step": 26355 + }, + { + "epoch": 15.54245283018868, + "grad_norm": 3.5838539600372314, + "learning_rate": 1.4387282888155695e-06, + "loss": 0.2106, + "num_input_tokens_seen": 17205920, + "step": 26360 + }, + { + "epoch": 15.545400943396226, + "grad_norm": 3.058284282684326, + "learning_rate": 1.436922917053013e-06, + "loss": 0.3071, + "num_input_tokens_seen": 17209152, + "step": 26365 + }, + { + "epoch": 15.548349056603774, + "grad_norm": 4.9887542724609375, + "learning_rate": 1.4351184886304686e-06, + "loss": 0.3295, + "num_input_tokens_seen": 17213056, + "step": 26370 + }, + { + "epoch": 15.55129716981132, + "grad_norm": 4.2343926429748535, + "learning_rate": 1.4333150040256699e-06, + "loss": 0.346, + "num_input_tokens_seen": 17216032, + "step": 26375 + }, + { + "epoch": 15.554245283018869, + "grad_norm": 2.267852783203125, + "learning_rate": 1.4315124637160954e-06, + "loss": 0.2676, + "num_input_tokens_seen": 17220640, + "step": 26380 + }, + { + "epoch": 15.557193396226415, + "grad_norm": 3.4346415996551514, + "learning_rate": 1.4297108681789752e-06, + "loss": 0.3225, + "num_input_tokens_seen": 17223520, + "step": 26385 + }, + { + "epoch": 15.560141509433961, + "grad_norm": 6.319263458251953, + "learning_rate": 1.4279102178912902e-06, + "loss": 0.32, + "num_input_tokens_seen": 17226112, + "step": 26390 + }, + { + "epoch": 15.56308962264151, + "grad_norm": 2.5334126949310303, + "learning_rate": 1.4261105133297693e-06, + "loss": 0.4329, + "num_input_tokens_seen": 17229120, + "step": 26395 + }, + { + "epoch": 15.566037735849056, + "grad_norm": 2.146577835083008, + "learning_rate": 1.4243117549708913e-06, + "loss": 0.3536, + "num_input_tokens_seen": 17232096, + "step": 26400 + }, + { + "epoch": 15.568985849056604, + "grad_norm": 3.9969778060913086, + "learning_rate": 1.422513943290888e-06, + "loss": 0.2398, + "num_input_tokens_seen": 17235744, + "step": 26405 + }, + { + "epoch": 15.57193396226415, + "grad_norm": 3.6518540382385254, + "learning_rate": 1.4207170787657365e-06, + "loss": 0.3277, + "num_input_tokens_seen": 17238656, + "step": 26410 + }, + { + "epoch": 15.574882075471699, + "grad_norm": 3.3526155948638916, + "learning_rate": 1.4189211618711646e-06, + "loss": 0.3856, + "num_input_tokens_seen": 17241952, + "step": 26415 + }, + { + "epoch": 15.577830188679245, + "grad_norm": 3.049076557159424, + "learning_rate": 1.417126193082648e-06, + "loss": 0.2421, + "num_input_tokens_seen": 17245152, + "step": 26420 + }, + { + "epoch": 15.580778301886792, + "grad_norm": 3.4857053756713867, + "learning_rate": 1.4153321728754133e-06, + "loss": 0.2122, + "num_input_tokens_seen": 17248416, + "step": 26425 + }, + { + "epoch": 15.58372641509434, + "grad_norm": 4.722821235656738, + "learning_rate": 1.4135391017244338e-06, + "loss": 0.3111, + "num_input_tokens_seen": 17251648, + "step": 26430 + }, + { + "epoch": 15.586674528301886, + "grad_norm": 3.9381651878356934, + "learning_rate": 1.4117469801044332e-06, + "loss": 0.3405, + "num_input_tokens_seen": 17255744, + "step": 26435 + }, + { + "epoch": 15.589622641509434, + "grad_norm": 4.540798664093018, + "learning_rate": 1.4099558084898862e-06, + "loss": 0.3773, + "num_input_tokens_seen": 17259040, + "step": 26440 + }, + { + "epoch": 15.59257075471698, + "grad_norm": 2.6230900287628174, + "learning_rate": 1.408165587355011e-06, + "loss": 0.2728, + "num_input_tokens_seen": 17262784, + "step": 26445 + }, + { + "epoch": 15.595518867924529, + "grad_norm": 2.424760341644287, + "learning_rate": 1.4063763171737766e-06, + "loss": 0.3395, + "num_input_tokens_seen": 17265504, + "step": 26450 + }, + { + "epoch": 15.598466981132075, + "grad_norm": 2.973039388656616, + "learning_rate": 1.4045879984198996e-06, + "loss": 0.2791, + "num_input_tokens_seen": 17269440, + "step": 26455 + }, + { + "epoch": 15.601415094339622, + "grad_norm": 1.8335871696472168, + "learning_rate": 1.4028006315668457e-06, + "loss": 0.3203, + "num_input_tokens_seen": 17272480, + "step": 26460 + }, + { + "epoch": 15.60436320754717, + "grad_norm": 10.609156608581543, + "learning_rate": 1.4010142170878261e-06, + "loss": 0.4486, + "num_input_tokens_seen": 17275808, + "step": 26465 + }, + { + "epoch": 15.607311320754716, + "grad_norm": 3.8790392875671387, + "learning_rate": 1.3992287554558042e-06, + "loss": 0.3875, + "num_input_tokens_seen": 17279424, + "step": 26470 + }, + { + "epoch": 15.610259433962264, + "grad_norm": 2.501727342605591, + "learning_rate": 1.3974442471434885e-06, + "loss": 0.268, + "num_input_tokens_seen": 17282080, + "step": 26475 + }, + { + "epoch": 15.61320754716981, + "grad_norm": 5.29008150100708, + "learning_rate": 1.395660692623334e-06, + "loss": 0.3232, + "num_input_tokens_seen": 17284608, + "step": 26480 + }, + { + "epoch": 15.616155660377359, + "grad_norm": 3.5645017623901367, + "learning_rate": 1.3938780923675454e-06, + "loss": 0.312, + "num_input_tokens_seen": 17287328, + "step": 26485 + }, + { + "epoch": 15.619103773584905, + "grad_norm": 5.405980587005615, + "learning_rate": 1.3920964468480718e-06, + "loss": 0.3907, + "num_input_tokens_seen": 17289856, + "step": 26490 + }, + { + "epoch": 15.622051886792454, + "grad_norm": 4.0511884689331055, + "learning_rate": 1.3903157565366143e-06, + "loss": 0.2877, + "num_input_tokens_seen": 17292736, + "step": 26495 + }, + { + "epoch": 15.625, + "grad_norm": 3.301304340362549, + "learning_rate": 1.3885360219046172e-06, + "loss": 0.3608, + "num_input_tokens_seen": 17295968, + "step": 26500 + }, + { + "epoch": 15.627948113207546, + "grad_norm": 3.1807429790496826, + "learning_rate": 1.386757243423273e-06, + "loss": 0.3303, + "num_input_tokens_seen": 17298144, + "step": 26505 + }, + { + "epoch": 15.630896226415095, + "grad_norm": 3.662149429321289, + "learning_rate": 1.384979421563521e-06, + "loss": 0.3568, + "num_input_tokens_seen": 17301184, + "step": 26510 + }, + { + "epoch": 15.633844339622641, + "grad_norm": 5.271115303039551, + "learning_rate": 1.3832025567960465e-06, + "loss": 0.4353, + "num_input_tokens_seen": 17303808, + "step": 26515 + }, + { + "epoch": 15.63679245283019, + "grad_norm": 2.084275245666504, + "learning_rate": 1.3814266495912815e-06, + "loss": 0.3105, + "num_input_tokens_seen": 17306784, + "step": 26520 + }, + { + "epoch": 15.639740566037736, + "grad_norm": 3.6416585445404053, + "learning_rate": 1.3796517004194078e-06, + "loss": 0.3354, + "num_input_tokens_seen": 17310432, + "step": 26525 + }, + { + "epoch": 15.642688679245284, + "grad_norm": 4.203418731689453, + "learning_rate": 1.3778777097503476e-06, + "loss": 0.3413, + "num_input_tokens_seen": 17313152, + "step": 26530 + }, + { + "epoch": 15.64563679245283, + "grad_norm": 2.231419086456299, + "learning_rate": 1.3761046780537757e-06, + "loss": 0.2968, + "num_input_tokens_seen": 17315680, + "step": 26535 + }, + { + "epoch": 15.648584905660378, + "grad_norm": 2.2358946800231934, + "learning_rate": 1.3743326057991086e-06, + "loss": 0.1888, + "num_input_tokens_seen": 17318880, + "step": 26540 + }, + { + "epoch": 15.651533018867925, + "grad_norm": 3.609388828277588, + "learning_rate": 1.3725614934555093e-06, + "loss": 0.3633, + "num_input_tokens_seen": 17321728, + "step": 26545 + }, + { + "epoch": 15.654481132075471, + "grad_norm": 3.898571252822876, + "learning_rate": 1.3707913414918882e-06, + "loss": 0.2854, + "num_input_tokens_seen": 17324896, + "step": 26550 + }, + { + "epoch": 15.65742924528302, + "grad_norm": 3.0724971294403076, + "learning_rate": 1.3690221503768996e-06, + "loss": 0.2834, + "num_input_tokens_seen": 17328448, + "step": 26555 + }, + { + "epoch": 15.660377358490566, + "grad_norm": 3.8443989753723145, + "learning_rate": 1.3672539205789465e-06, + "loss": 0.4084, + "num_input_tokens_seen": 17332224, + "step": 26560 + }, + { + "epoch": 15.663325471698114, + "grad_norm": 4.084451198577881, + "learning_rate": 1.3654866525661737e-06, + "loss": 0.3564, + "num_input_tokens_seen": 17334624, + "step": 26565 + }, + { + "epoch": 15.66627358490566, + "grad_norm": 5.078795433044434, + "learning_rate": 1.3637203468064741e-06, + "loss": 0.4208, + "num_input_tokens_seen": 17339936, + "step": 26570 + }, + { + "epoch": 15.669221698113208, + "grad_norm": 7.502529621124268, + "learning_rate": 1.3619550037674838e-06, + "loss": 0.3612, + "num_input_tokens_seen": 17342720, + "step": 26575 + }, + { + "epoch": 15.672169811320755, + "grad_norm": 5.799961090087891, + "learning_rate": 1.3601906239165857e-06, + "loss": 0.3434, + "num_input_tokens_seen": 17346464, + "step": 26580 + }, + { + "epoch": 15.675117924528301, + "grad_norm": 3.802048444747925, + "learning_rate": 1.3584272077209048e-06, + "loss": 0.4204, + "num_input_tokens_seen": 17349600, + "step": 26585 + }, + { + "epoch": 15.67806603773585, + "grad_norm": 3.458008050918579, + "learning_rate": 1.3566647556473168e-06, + "loss": 0.2809, + "num_input_tokens_seen": 17352512, + "step": 26590 + }, + { + "epoch": 15.681014150943396, + "grad_norm": 3.2562687397003174, + "learning_rate": 1.3549032681624363e-06, + "loss": 0.3846, + "num_input_tokens_seen": 17356896, + "step": 26595 + }, + { + "epoch": 15.683962264150944, + "grad_norm": 3.19452166557312, + "learning_rate": 1.3531427457326252e-06, + "loss": 0.2189, + "num_input_tokens_seen": 17360160, + "step": 26600 + }, + { + "epoch": 15.68691037735849, + "grad_norm": 3.9227116107940674, + "learning_rate": 1.3513831888239893e-06, + "loss": 0.3953, + "num_input_tokens_seen": 17363488, + "step": 26605 + }, + { + "epoch": 15.689858490566039, + "grad_norm": 3.5639758110046387, + "learning_rate": 1.3496245979023786e-06, + "loss": 0.2436, + "num_input_tokens_seen": 17367296, + "step": 26610 + }, + { + "epoch": 15.692806603773585, + "grad_norm": 3.6830461025238037, + "learning_rate": 1.3478669734333865e-06, + "loss": 0.3612, + "num_input_tokens_seen": 17371008, + "step": 26615 + }, + { + "epoch": 15.695754716981131, + "grad_norm": 3.814260721206665, + "learning_rate": 1.3461103158823546e-06, + "loss": 0.3699, + "num_input_tokens_seen": 17374304, + "step": 26620 + }, + { + "epoch": 15.69870283018868, + "grad_norm": 4.493785381317139, + "learning_rate": 1.3443546257143624e-06, + "loss": 0.2904, + "num_input_tokens_seen": 17377344, + "step": 26625 + }, + { + "epoch": 15.701650943396226, + "grad_norm": 3.8498027324676514, + "learning_rate": 1.3425999033942395e-06, + "loss": 0.3278, + "num_input_tokens_seen": 17380032, + "step": 26630 + }, + { + "epoch": 15.704599056603774, + "grad_norm": 2.684558153152466, + "learning_rate": 1.3408461493865549e-06, + "loss": 0.3639, + "num_input_tokens_seen": 17382336, + "step": 26635 + }, + { + "epoch": 15.70754716981132, + "grad_norm": 4.0287394523620605, + "learning_rate": 1.339093364155622e-06, + "loss": 0.4187, + "num_input_tokens_seen": 17385664, + "step": 26640 + }, + { + "epoch": 15.710495283018869, + "grad_norm": 3.1829121112823486, + "learning_rate": 1.3373415481654988e-06, + "loss": 0.4071, + "num_input_tokens_seen": 17389056, + "step": 26645 + }, + { + "epoch": 15.713443396226415, + "grad_norm": 3.048290729522705, + "learning_rate": 1.335590701879984e-06, + "loss": 0.3444, + "num_input_tokens_seen": 17391936, + "step": 26650 + }, + { + "epoch": 15.716391509433961, + "grad_norm": 1.9633924961090088, + "learning_rate": 1.3338408257626257e-06, + "loss": 0.2407, + "num_input_tokens_seen": 17396832, + "step": 26655 + }, + { + "epoch": 15.71933962264151, + "grad_norm": 8.257603645324707, + "learning_rate": 1.3320919202767086e-06, + "loss": 0.2513, + "num_input_tokens_seen": 17399232, + "step": 26660 + }, + { + "epoch": 15.722287735849056, + "grad_norm": 9.309518814086914, + "learning_rate": 1.3303439858852636e-06, + "loss": 0.3838, + "num_input_tokens_seen": 17402624, + "step": 26665 + }, + { + "epoch": 15.725235849056604, + "grad_norm": 3.427581548690796, + "learning_rate": 1.3285970230510636e-06, + "loss": 0.3655, + "num_input_tokens_seen": 17406016, + "step": 26670 + }, + { + "epoch": 15.72818396226415, + "grad_norm": 2.9810760021209717, + "learning_rate": 1.3268510322366246e-06, + "loss": 0.3531, + "num_input_tokens_seen": 17408928, + "step": 26675 + }, + { + "epoch": 15.731132075471699, + "grad_norm": 3.1264753341674805, + "learning_rate": 1.3251060139042038e-06, + "loss": 0.2431, + "num_input_tokens_seen": 17412096, + "step": 26680 + }, + { + "epoch": 15.734080188679245, + "grad_norm": 4.51759147644043, + "learning_rate": 1.3233619685158056e-06, + "loss": 0.3417, + "num_input_tokens_seen": 17414816, + "step": 26685 + }, + { + "epoch": 15.737028301886792, + "grad_norm": 7.457120418548584, + "learning_rate": 1.3216188965331712e-06, + "loss": 0.3285, + "num_input_tokens_seen": 17417376, + "step": 26690 + }, + { + "epoch": 15.73997641509434, + "grad_norm": 4.285046100616455, + "learning_rate": 1.3198767984177869e-06, + "loss": 0.3344, + "num_input_tokens_seen": 17420384, + "step": 26695 + }, + { + "epoch": 15.742924528301886, + "grad_norm": 3.7005157470703125, + "learning_rate": 1.3181356746308805e-06, + "loss": 0.3501, + "num_input_tokens_seen": 17423840, + "step": 26700 + }, + { + "epoch": 15.745872641509434, + "grad_norm": 3.1018455028533936, + "learning_rate": 1.3163955256334226e-06, + "loss": 0.2717, + "num_input_tokens_seen": 17426784, + "step": 26705 + }, + { + "epoch": 15.74882075471698, + "grad_norm": 4.256278038024902, + "learning_rate": 1.3146563518861227e-06, + "loss": 0.3016, + "num_input_tokens_seen": 17430080, + "step": 26710 + }, + { + "epoch": 15.751768867924529, + "grad_norm": 2.8724923133850098, + "learning_rate": 1.3129181538494384e-06, + "loss": 0.2567, + "num_input_tokens_seen": 17432768, + "step": 26715 + }, + { + "epoch": 15.754716981132075, + "grad_norm": 2.3154377937316895, + "learning_rate": 1.3111809319835622e-06, + "loss": 0.3205, + "num_input_tokens_seen": 17437504, + "step": 26720 + }, + { + "epoch": 15.757665094339622, + "grad_norm": 3.688124895095825, + "learning_rate": 1.3094446867484335e-06, + "loss": 0.3815, + "num_input_tokens_seen": 17440448, + "step": 26725 + }, + { + "epoch": 15.76061320754717, + "grad_norm": 3.2079412937164307, + "learning_rate": 1.3077094186037287e-06, + "loss": 0.4823, + "num_input_tokens_seen": 17443392, + "step": 26730 + }, + { + "epoch": 15.763561320754716, + "grad_norm": 4.019981384277344, + "learning_rate": 1.305975128008869e-06, + "loss": 0.4009, + "num_input_tokens_seen": 17445824, + "step": 26735 + }, + { + "epoch": 15.766509433962264, + "grad_norm": 6.259704113006592, + "learning_rate": 1.304241815423014e-06, + "loss": 0.3692, + "num_input_tokens_seen": 17448800, + "step": 26740 + }, + { + "epoch": 15.76945754716981, + "grad_norm": 5.5584893226623535, + "learning_rate": 1.3025094813050655e-06, + "loss": 0.3118, + "num_input_tokens_seen": 17451872, + "step": 26745 + }, + { + "epoch": 15.772405660377359, + "grad_norm": 2.5257208347320557, + "learning_rate": 1.3007781261136675e-06, + "loss": 0.35, + "num_input_tokens_seen": 17455584, + "step": 26750 + }, + { + "epoch": 15.775353773584905, + "grad_norm": 3.5317041873931885, + "learning_rate": 1.299047750307204e-06, + "loss": 0.3432, + "num_input_tokens_seen": 17459648, + "step": 26755 + }, + { + "epoch": 15.778301886792454, + "grad_norm": 3.3328137397766113, + "learning_rate": 1.297318354343799e-06, + "loss": 0.2918, + "num_input_tokens_seen": 17462528, + "step": 26760 + }, + { + "epoch": 15.78125, + "grad_norm": 4.417114734649658, + "learning_rate": 1.295589938681317e-06, + "loss": 0.3034, + "num_input_tokens_seen": 17466592, + "step": 26765 + }, + { + "epoch": 15.784198113207546, + "grad_norm": 3.058224678039551, + "learning_rate": 1.2938625037773628e-06, + "loss": 0.3418, + "num_input_tokens_seen": 17470240, + "step": 26770 + }, + { + "epoch": 15.787146226415095, + "grad_norm": 3.829861640930176, + "learning_rate": 1.2921360500892843e-06, + "loss": 0.2628, + "num_input_tokens_seen": 17474080, + "step": 26775 + }, + { + "epoch": 15.790094339622641, + "grad_norm": 3.8211669921875, + "learning_rate": 1.290410578074167e-06, + "loss": 0.3553, + "num_input_tokens_seen": 17476928, + "step": 26780 + }, + { + "epoch": 15.79304245283019, + "grad_norm": 4.273032188415527, + "learning_rate": 1.2886860881888362e-06, + "loss": 0.3007, + "num_input_tokens_seen": 17480256, + "step": 26785 + }, + { + "epoch": 15.795990566037736, + "grad_norm": 4.894825458526611, + "learning_rate": 1.2869625808898584e-06, + "loss": 0.2926, + "num_input_tokens_seen": 17483168, + "step": 26790 + }, + { + "epoch": 15.798938679245284, + "grad_norm": 12.426753997802734, + "learning_rate": 1.2852400566335398e-06, + "loss": 0.5848, + "num_input_tokens_seen": 17487360, + "step": 26795 + }, + { + "epoch": 15.80188679245283, + "grad_norm": 4.626605033874512, + "learning_rate": 1.2835185158759244e-06, + "loss": 0.3643, + "num_input_tokens_seen": 17491648, + "step": 26800 + }, + { + "epoch": 15.804834905660378, + "grad_norm": 3.019444227218628, + "learning_rate": 1.2817979590728009e-06, + "loss": 0.3023, + "num_input_tokens_seen": 17494464, + "step": 26805 + }, + { + "epoch": 15.807783018867925, + "grad_norm": 4.013226509094238, + "learning_rate": 1.2800783866796918e-06, + "loss": 0.3423, + "num_input_tokens_seen": 17497280, + "step": 26810 + }, + { + "epoch": 15.810731132075471, + "grad_norm": 5.741434097290039, + "learning_rate": 1.2783597991518604e-06, + "loss": 0.363, + "num_input_tokens_seen": 17500704, + "step": 26815 + }, + { + "epoch": 15.81367924528302, + "grad_norm": 5.237295150756836, + "learning_rate": 1.2766421969443131e-06, + "loss": 0.3457, + "num_input_tokens_seen": 17504192, + "step": 26820 + }, + { + "epoch": 15.816627358490566, + "grad_norm": 3.333575963973999, + "learning_rate": 1.274925580511791e-06, + "loss": 0.2919, + "num_input_tokens_seen": 17507808, + "step": 26825 + }, + { + "epoch": 15.819575471698114, + "grad_norm": 3.4029648303985596, + "learning_rate": 1.2732099503087757e-06, + "loss": 0.1826, + "num_input_tokens_seen": 17511488, + "step": 26830 + }, + { + "epoch": 15.82252358490566, + "grad_norm": 4.97998571395874, + "learning_rate": 1.2714953067894859e-06, + "loss": 0.2992, + "num_input_tokens_seen": 17514240, + "step": 26835 + }, + { + "epoch": 15.825471698113208, + "grad_norm": 4.448131561279297, + "learning_rate": 1.2697816504078847e-06, + "loss": 0.2914, + "num_input_tokens_seen": 17517024, + "step": 26840 + }, + { + "epoch": 15.828419811320755, + "grad_norm": 3.664102077484131, + "learning_rate": 1.2680689816176672e-06, + "loss": 0.4058, + "num_input_tokens_seen": 17519616, + "step": 26845 + }, + { + "epoch": 15.831367924528301, + "grad_norm": 4.474094390869141, + "learning_rate": 1.2663573008722707e-06, + "loss": 0.3124, + "num_input_tokens_seen": 17523072, + "step": 26850 + }, + { + "epoch": 15.83431603773585, + "grad_norm": 6.806681156158447, + "learning_rate": 1.2646466086248698e-06, + "loss": 0.2789, + "num_input_tokens_seen": 17525568, + "step": 26855 + }, + { + "epoch": 15.837264150943396, + "grad_norm": 4.433311462402344, + "learning_rate": 1.2629369053283779e-06, + "loss": 0.3958, + "num_input_tokens_seen": 17528480, + "step": 26860 + }, + { + "epoch": 15.840212264150944, + "grad_norm": 3.9505839347839355, + "learning_rate": 1.2612281914354452e-06, + "loss": 0.2362, + "num_input_tokens_seen": 17531168, + "step": 26865 + }, + { + "epoch": 15.84316037735849, + "grad_norm": 3.1731743812561035, + "learning_rate": 1.259520467398463e-06, + "loss": 0.3053, + "num_input_tokens_seen": 17534560, + "step": 26870 + }, + { + "epoch": 15.846108490566039, + "grad_norm": 2.6175055503845215, + "learning_rate": 1.2578137336695573e-06, + "loss": 0.2452, + "num_input_tokens_seen": 17537280, + "step": 26875 + }, + { + "epoch": 15.849056603773585, + "grad_norm": 6.345149517059326, + "learning_rate": 1.256107990700594e-06, + "loss": 0.3069, + "num_input_tokens_seen": 17540320, + "step": 26880 + }, + { + "epoch": 15.852004716981131, + "grad_norm": 5.569468975067139, + "learning_rate": 1.2544032389431753e-06, + "loss": 0.3686, + "num_input_tokens_seen": 17543264, + "step": 26885 + }, + { + "epoch": 15.85495283018868, + "grad_norm": 3.3612802028656006, + "learning_rate": 1.2526994788486418e-06, + "loss": 0.245, + "num_input_tokens_seen": 17546176, + "step": 26890 + }, + { + "epoch": 15.857900943396226, + "grad_norm": 2.4688563346862793, + "learning_rate": 1.2509967108680697e-06, + "loss": 0.3591, + "num_input_tokens_seen": 17549344, + "step": 26895 + }, + { + "epoch": 15.860849056603774, + "grad_norm": 4.200671195983887, + "learning_rate": 1.249294935452277e-06, + "loss": 0.3506, + "num_input_tokens_seen": 17553984, + "step": 26900 + }, + { + "epoch": 15.86379716981132, + "grad_norm": 0.5015525817871094, + "learning_rate": 1.247594153051815e-06, + "loss": 0.1985, + "num_input_tokens_seen": 17560768, + "step": 26905 + }, + { + "epoch": 15.866745283018869, + "grad_norm": 5.61729621887207, + "learning_rate": 1.2458943641169718e-06, + "loss": 0.3016, + "num_input_tokens_seen": 17563744, + "step": 26910 + }, + { + "epoch": 15.869693396226415, + "grad_norm": 4.284752368927002, + "learning_rate": 1.2441955690977758e-06, + "loss": 0.2823, + "num_input_tokens_seen": 17567168, + "step": 26915 + }, + { + "epoch": 15.872641509433961, + "grad_norm": 5.747034072875977, + "learning_rate": 1.2424977684439898e-06, + "loss": 0.3532, + "num_input_tokens_seen": 17570528, + "step": 26920 + }, + { + "epoch": 15.87558962264151, + "grad_norm": 2.929811954498291, + "learning_rate": 1.2408009626051137e-06, + "loss": 0.3773, + "num_input_tokens_seen": 17573440, + "step": 26925 + }, + { + "epoch": 15.878537735849056, + "grad_norm": 4.433734893798828, + "learning_rate": 1.2391051520303826e-06, + "loss": 0.2489, + "num_input_tokens_seen": 17576736, + "step": 26930 + }, + { + "epoch": 15.881485849056604, + "grad_norm": 3.778148889541626, + "learning_rate": 1.2374103371687723e-06, + "loss": 0.4045, + "num_input_tokens_seen": 17579936, + "step": 26935 + }, + { + "epoch": 15.88443396226415, + "grad_norm": 6.346294403076172, + "learning_rate": 1.2357165184689906e-06, + "loss": 0.2868, + "num_input_tokens_seen": 17583200, + "step": 26940 + }, + { + "epoch": 15.887382075471699, + "grad_norm": 4.476423263549805, + "learning_rate": 1.2340236963794845e-06, + "loss": 0.2296, + "num_input_tokens_seen": 17586144, + "step": 26945 + }, + { + "epoch": 15.890330188679245, + "grad_norm": 6.744287967681885, + "learning_rate": 1.232331871348435e-06, + "loss": 0.3663, + "num_input_tokens_seen": 17588992, + "step": 26950 + }, + { + "epoch": 15.893278301886792, + "grad_norm": 3.5366709232330322, + "learning_rate": 1.2306410438237603e-06, + "loss": 0.388, + "num_input_tokens_seen": 17591744, + "step": 26955 + }, + { + "epoch": 15.89622641509434, + "grad_norm": 2.818878173828125, + "learning_rate": 1.228951214253113e-06, + "loss": 0.3444, + "num_input_tokens_seen": 17595616, + "step": 26960 + }, + { + "epoch": 15.899174528301886, + "grad_norm": 6.603791236877441, + "learning_rate": 1.2272623830838854e-06, + "loss": 0.3375, + "num_input_tokens_seen": 17598880, + "step": 26965 + }, + { + "epoch": 15.902122641509434, + "grad_norm": 3.752480983734131, + "learning_rate": 1.2255745507632016e-06, + "loss": 0.2687, + "num_input_tokens_seen": 17602240, + "step": 26970 + }, + { + "epoch": 15.90507075471698, + "grad_norm": 3.635101795196533, + "learning_rate": 1.223887717737922e-06, + "loss": 0.2864, + "num_input_tokens_seen": 17605824, + "step": 26975 + }, + { + "epoch": 15.908018867924529, + "grad_norm": 3.5107717514038086, + "learning_rate": 1.2222018844546434e-06, + "loss": 0.245, + "num_input_tokens_seen": 17609248, + "step": 26980 + }, + { + "epoch": 15.910966981132075, + "grad_norm": 2.665670156478882, + "learning_rate": 1.2205170513596975e-06, + "loss": 0.3283, + "num_input_tokens_seen": 17612640, + "step": 26985 + }, + { + "epoch": 15.913915094339622, + "grad_norm": 4.795198917388916, + "learning_rate": 1.2188332188991493e-06, + "loss": 0.2401, + "num_input_tokens_seen": 17615680, + "step": 26990 + }, + { + "epoch": 15.91686320754717, + "grad_norm": 5.135300636291504, + "learning_rate": 1.217150387518804e-06, + "loss": 0.3587, + "num_input_tokens_seen": 17618656, + "step": 26995 + }, + { + "epoch": 15.919811320754716, + "grad_norm": 7.929037570953369, + "learning_rate": 1.2154685576641967e-06, + "loss": 0.4361, + "num_input_tokens_seen": 17621344, + "step": 27000 + }, + { + "epoch": 15.922759433962264, + "grad_norm": 3.451627731323242, + "learning_rate": 1.2137877297805972e-06, + "loss": 0.2664, + "num_input_tokens_seen": 17625088, + "step": 27005 + }, + { + "epoch": 15.92570754716981, + "grad_norm": 4.957344055175781, + "learning_rate": 1.2121079043130162e-06, + "loss": 0.3389, + "num_input_tokens_seen": 17632320, + "step": 27010 + }, + { + "epoch": 15.928655660377359, + "grad_norm": 4.476447582244873, + "learning_rate": 1.210429081706192e-06, + "loss": 0.3491, + "num_input_tokens_seen": 17635072, + "step": 27015 + }, + { + "epoch": 15.931603773584905, + "grad_norm": 4.012531280517578, + "learning_rate": 1.2087512624046005e-06, + "loss": 0.4026, + "num_input_tokens_seen": 17638304, + "step": 27020 + }, + { + "epoch": 15.934551886792454, + "grad_norm": 3.8610475063323975, + "learning_rate": 1.2070744468524503e-06, + "loss": 0.3374, + "num_input_tokens_seen": 17641216, + "step": 27025 + }, + { + "epoch": 15.9375, + "grad_norm": 4.519809246063232, + "learning_rate": 1.2053986354936887e-06, + "loss": 0.2618, + "num_input_tokens_seen": 17644576, + "step": 27030 + }, + { + "epoch": 15.940448113207546, + "grad_norm": 5.832067966461182, + "learning_rate": 1.2037238287719916e-06, + "loss": 0.283, + "num_input_tokens_seen": 17647648, + "step": 27035 + }, + { + "epoch": 15.943396226415095, + "grad_norm": 3.2916293144226074, + "learning_rate": 1.2020500271307721e-06, + "loss": 0.3916, + "num_input_tokens_seen": 17651040, + "step": 27040 + }, + { + "epoch": 15.946344339622641, + "grad_norm": 3.110333204269409, + "learning_rate": 1.200377231013176e-06, + "loss": 0.4676, + "num_input_tokens_seen": 17653728, + "step": 27045 + }, + { + "epoch": 15.94929245283019, + "grad_norm": 4.068283557891846, + "learning_rate": 1.1987054408620825e-06, + "loss": 0.4361, + "num_input_tokens_seen": 17656352, + "step": 27050 + }, + { + "epoch": 15.952240566037736, + "grad_norm": 3.59666109085083, + "learning_rate": 1.197034657120107e-06, + "loss": 0.3539, + "num_input_tokens_seen": 17662432, + "step": 27055 + }, + { + "epoch": 15.955188679245284, + "grad_norm": 2.68633770942688, + "learning_rate": 1.1953648802295964e-06, + "loss": 0.2473, + "num_input_tokens_seen": 17665888, + "step": 27060 + }, + { + "epoch": 15.95813679245283, + "grad_norm": 5.497695446014404, + "learning_rate": 1.1936961106326307e-06, + "loss": 0.278, + "num_input_tokens_seen": 17669056, + "step": 27065 + }, + { + "epoch": 15.961084905660378, + "grad_norm": 5.573581695556641, + "learning_rate": 1.1920283487710237e-06, + "loss": 0.256, + "num_input_tokens_seen": 17671200, + "step": 27070 + }, + { + "epoch": 15.964033018867925, + "grad_norm": 4.499593257904053, + "learning_rate": 1.1903615950863228e-06, + "loss": 0.2659, + "num_input_tokens_seen": 17673856, + "step": 27075 + }, + { + "epoch": 15.966981132075471, + "grad_norm": 2.468602418899536, + "learning_rate": 1.1886958500198076e-06, + "loss": 0.3612, + "num_input_tokens_seen": 17677280, + "step": 27080 + }, + { + "epoch": 15.96992924528302, + "grad_norm": 3.609217405319214, + "learning_rate": 1.1870311140124923e-06, + "loss": 0.3786, + "num_input_tokens_seen": 17680800, + "step": 27085 + }, + { + "epoch": 15.972877358490566, + "grad_norm": 17.841140747070312, + "learning_rate": 1.185367387505123e-06, + "loss": 0.3165, + "num_input_tokens_seen": 17683520, + "step": 27090 + }, + { + "epoch": 15.975825471698114, + "grad_norm": 3.465113639831543, + "learning_rate": 1.1837046709381783e-06, + "loss": 0.6599, + "num_input_tokens_seen": 17686272, + "step": 27095 + }, + { + "epoch": 15.97877358490566, + "grad_norm": 4.342942237854004, + "learning_rate": 1.1820429647518678e-06, + "loss": 0.2871, + "num_input_tokens_seen": 17689952, + "step": 27100 + }, + { + "epoch": 15.981721698113208, + "grad_norm": 4.398411273956299, + "learning_rate": 1.1803822693861377e-06, + "loss": 0.3556, + "num_input_tokens_seen": 17692672, + "step": 27105 + }, + { + "epoch": 15.984669811320755, + "grad_norm": 2.412160634994507, + "learning_rate": 1.1787225852806639e-06, + "loss": 0.2598, + "num_input_tokens_seen": 17695712, + "step": 27110 + }, + { + "epoch": 15.987617924528301, + "grad_norm": 3.93445086479187, + "learning_rate": 1.177063912874853e-06, + "loss": 0.3386, + "num_input_tokens_seen": 17698400, + "step": 27115 + }, + { + "epoch": 15.99056603773585, + "grad_norm": 5.107640266418457, + "learning_rate": 1.1754062526078487e-06, + "loss": 0.537, + "num_input_tokens_seen": 17701856, + "step": 27120 + }, + { + "epoch": 15.993514150943396, + "grad_norm": 5.500094890594482, + "learning_rate": 1.1737496049185215e-06, + "loss": 0.3308, + "num_input_tokens_seen": 17704864, + "step": 27125 + }, + { + "epoch": 15.996462264150944, + "grad_norm": 4.875102519989014, + "learning_rate": 1.172093970245477e-06, + "loss": 0.3565, + "num_input_tokens_seen": 17708480, + "step": 27130 + }, + { + "epoch": 15.99941037735849, + "grad_norm": 2.8622496128082275, + "learning_rate": 1.1704393490270516e-06, + "loss": 0.3286, + "num_input_tokens_seen": 17712032, + "step": 27135 + }, + { + "epoch": 16.0, + "eval_loss": 0.6074872016906738, + "eval_runtime": 19.3398, + "eval_samples_per_second": 87.695, + "eval_steps_per_second": 21.924, + "num_input_tokens_seen": 17712104, + "step": 27136 + }, + { + "epoch": 16.00235849056604, + "grad_norm": 2.9562137126922607, + "learning_rate": 1.1687857417013126e-06, + "loss": 0.2119, + "num_input_tokens_seen": 17715048, + "step": 27140 + }, + { + "epoch": 16.005306603773583, + "grad_norm": 3.408649206161499, + "learning_rate": 1.1671331487060583e-06, + "loss": 0.3081, + "num_input_tokens_seen": 17717992, + "step": 27145 + }, + { + "epoch": 16.00825471698113, + "grad_norm": 4.2913641929626465, + "learning_rate": 1.1654815704788237e-06, + "loss": 0.3316, + "num_input_tokens_seen": 17720328, + "step": 27150 + }, + { + "epoch": 16.01120283018868, + "grad_norm": 2.9438507556915283, + "learning_rate": 1.1638310074568687e-06, + "loss": 0.2589, + "num_input_tokens_seen": 17723880, + "step": 27155 + }, + { + "epoch": 16.014150943396228, + "grad_norm": 3.5376648902893066, + "learning_rate": 1.162181460077188e-06, + "loss": 0.2891, + "num_input_tokens_seen": 17727944, + "step": 27160 + }, + { + "epoch": 16.017099056603772, + "grad_norm": 2.6878323554992676, + "learning_rate": 1.1605329287765056e-06, + "loss": 0.2962, + "num_input_tokens_seen": 17731848, + "step": 27165 + }, + { + "epoch": 16.02004716981132, + "grad_norm": 6.028411388397217, + "learning_rate": 1.1588854139912775e-06, + "loss": 0.2633, + "num_input_tokens_seen": 17734536, + "step": 27170 + }, + { + "epoch": 16.02299528301887, + "grad_norm": 2.871877908706665, + "learning_rate": 1.1572389161576886e-06, + "loss": 0.2476, + "num_input_tokens_seen": 17738312, + "step": 27175 + }, + { + "epoch": 16.025943396226417, + "grad_norm": 3.234715461730957, + "learning_rate": 1.15559343571166e-06, + "loss": 0.4312, + "num_input_tokens_seen": 17741480, + "step": 27180 + }, + { + "epoch": 16.02889150943396, + "grad_norm": 6.845321178436279, + "learning_rate": 1.153948973088837e-06, + "loss": 0.3601, + "num_input_tokens_seen": 17744520, + "step": 27185 + }, + { + "epoch": 16.03183962264151, + "grad_norm": 3.866924524307251, + "learning_rate": 1.1523055287245993e-06, + "loss": 0.2473, + "num_input_tokens_seen": 17747080, + "step": 27190 + }, + { + "epoch": 16.034787735849058, + "grad_norm": 4.404837131500244, + "learning_rate": 1.150663103054056e-06, + "loss": 0.3196, + "num_input_tokens_seen": 17750632, + "step": 27195 + }, + { + "epoch": 16.037735849056602, + "grad_norm": 4.610869407653809, + "learning_rate": 1.1490216965120438e-06, + "loss": 0.2132, + "num_input_tokens_seen": 17753864, + "step": 27200 + }, + { + "epoch": 16.04068396226415, + "grad_norm": 3.881624937057495, + "learning_rate": 1.147381309533136e-06, + "loss": 0.3224, + "num_input_tokens_seen": 17756744, + "step": 27205 + }, + { + "epoch": 16.0436320754717, + "grad_norm": 3.3644521236419678, + "learning_rate": 1.1457419425516287e-06, + "loss": 0.2373, + "num_input_tokens_seen": 17759720, + "step": 27210 + }, + { + "epoch": 16.046580188679247, + "grad_norm": 2.6429624557495117, + "learning_rate": 1.1441035960015544e-06, + "loss": 0.3176, + "num_input_tokens_seen": 17762792, + "step": 27215 + }, + { + "epoch": 16.04952830188679, + "grad_norm": 5.130011558532715, + "learning_rate": 1.1424662703166716e-06, + "loss": 0.2959, + "num_input_tokens_seen": 17765960, + "step": 27220 + }, + { + "epoch": 16.05247641509434, + "grad_norm": 4.988994598388672, + "learning_rate": 1.1408299659304684e-06, + "loss": 0.4118, + "num_input_tokens_seen": 17769160, + "step": 27225 + }, + { + "epoch": 16.055424528301888, + "grad_norm": 4.97374963760376, + "learning_rate": 1.1391946832761642e-06, + "loss": 0.2657, + "num_input_tokens_seen": 17772424, + "step": 27230 + }, + { + "epoch": 16.058372641509433, + "grad_norm": 1.6416126489639282, + "learning_rate": 1.137560422786706e-06, + "loss": 0.2424, + "num_input_tokens_seen": 17775336, + "step": 27235 + }, + { + "epoch": 16.06132075471698, + "grad_norm": 3.584343194961548, + "learning_rate": 1.1359271848947712e-06, + "loss": 0.3095, + "num_input_tokens_seen": 17778600, + "step": 27240 + }, + { + "epoch": 16.06426886792453, + "grad_norm": 3.2108006477355957, + "learning_rate": 1.1342949700327688e-06, + "loss": 0.4226, + "num_input_tokens_seen": 17782760, + "step": 27245 + }, + { + "epoch": 16.067216981132077, + "grad_norm": 2.745878219604492, + "learning_rate": 1.1326637786328332e-06, + "loss": 0.4028, + "num_input_tokens_seen": 17786952, + "step": 27250 + }, + { + "epoch": 16.07016509433962, + "grad_norm": 3.7533814907073975, + "learning_rate": 1.1310336111268293e-06, + "loss": 0.3196, + "num_input_tokens_seen": 17789544, + "step": 27255 + }, + { + "epoch": 16.07311320754717, + "grad_norm": 2.7761082649230957, + "learning_rate": 1.1294044679463517e-06, + "loss": 0.3191, + "num_input_tokens_seen": 17793576, + "step": 27260 + }, + { + "epoch": 16.076061320754718, + "grad_norm": 2.255887269973755, + "learning_rate": 1.1277763495227207e-06, + "loss": 0.2853, + "num_input_tokens_seen": 17797896, + "step": 27265 + }, + { + "epoch": 16.079009433962263, + "grad_norm": 3.3480160236358643, + "learning_rate": 1.1261492562869913e-06, + "loss": 0.2075, + "num_input_tokens_seen": 17800712, + "step": 27270 + }, + { + "epoch": 16.08195754716981, + "grad_norm": 4.009567737579346, + "learning_rate": 1.1245231886699415e-06, + "loss": 0.2727, + "num_input_tokens_seen": 17803592, + "step": 27275 + }, + { + "epoch": 16.08490566037736, + "grad_norm": 3.303662061691284, + "learning_rate": 1.12289814710208e-06, + "loss": 0.3031, + "num_input_tokens_seen": 17806920, + "step": 27280 + }, + { + "epoch": 16.087853773584907, + "grad_norm": 3.079800605773926, + "learning_rate": 1.1212741320136433e-06, + "loss": 0.3875, + "num_input_tokens_seen": 17810728, + "step": 27285 + }, + { + "epoch": 16.090801886792452, + "grad_norm": 6.233506202697754, + "learning_rate": 1.1196511438345963e-06, + "loss": 0.3086, + "num_input_tokens_seen": 17813928, + "step": 27290 + }, + { + "epoch": 16.09375, + "grad_norm": 4.258075714111328, + "learning_rate": 1.118029182994631e-06, + "loss": 0.2685, + "num_input_tokens_seen": 17817128, + "step": 27295 + }, + { + "epoch": 16.096698113207548, + "grad_norm": 2.6861300468444824, + "learning_rate": 1.1164082499231704e-06, + "loss": 0.3804, + "num_input_tokens_seen": 17820968, + "step": 27300 + }, + { + "epoch": 16.099646226415093, + "grad_norm": 3.8812906742095947, + "learning_rate": 1.114788345049364e-06, + "loss": 0.266, + "num_input_tokens_seen": 17823944, + "step": 27305 + }, + { + "epoch": 16.10259433962264, + "grad_norm": 5.270434379577637, + "learning_rate": 1.1131694688020872e-06, + "loss": 0.4085, + "num_input_tokens_seen": 17826824, + "step": 27310 + }, + { + "epoch": 16.10554245283019, + "grad_norm": 3.719679832458496, + "learning_rate": 1.1115516216099453e-06, + "loss": 0.3681, + "num_input_tokens_seen": 17829800, + "step": 27315 + }, + { + "epoch": 16.108490566037737, + "grad_norm": 3.4988315105438232, + "learning_rate": 1.1099348039012698e-06, + "loss": 0.3387, + "num_input_tokens_seen": 17832744, + "step": 27320 + }, + { + "epoch": 16.111438679245282, + "grad_norm": 5.051823616027832, + "learning_rate": 1.1083190161041202e-06, + "loss": 0.3687, + "num_input_tokens_seen": 17835240, + "step": 27325 + }, + { + "epoch": 16.11438679245283, + "grad_norm": 4.022293567657471, + "learning_rate": 1.1067042586462822e-06, + "loss": 0.2662, + "num_input_tokens_seen": 17838024, + "step": 27330 + }, + { + "epoch": 16.11733490566038, + "grad_norm": 4.672703266143799, + "learning_rate": 1.1050905319552718e-06, + "loss": 0.4647, + "num_input_tokens_seen": 17841032, + "step": 27335 + }, + { + "epoch": 16.120283018867923, + "grad_norm": 3.8369011878967285, + "learning_rate": 1.1034778364583293e-06, + "loss": 0.2946, + "num_input_tokens_seen": 17844968, + "step": 27340 + }, + { + "epoch": 16.12323113207547, + "grad_norm": 4.505640506744385, + "learning_rate": 1.1018661725824231e-06, + "loss": 0.3007, + "num_input_tokens_seen": 17847464, + "step": 27345 + }, + { + "epoch": 16.12617924528302, + "grad_norm": 3.8580055236816406, + "learning_rate": 1.100255540754247e-06, + "loss": 0.3636, + "num_input_tokens_seen": 17850504, + "step": 27350 + }, + { + "epoch": 16.129127358490567, + "grad_norm": 3.8682162761688232, + "learning_rate": 1.0986459414002244e-06, + "loss": 0.3576, + "num_input_tokens_seen": 17854248, + "step": 27355 + }, + { + "epoch": 16.132075471698112, + "grad_norm": 3.619281053543091, + "learning_rate": 1.0970373749465008e-06, + "loss": 0.354, + "num_input_tokens_seen": 17858600, + "step": 27360 + }, + { + "epoch": 16.13502358490566, + "grad_norm": 4.133915424346924, + "learning_rate": 1.095429841818954e-06, + "loss": 0.4058, + "num_input_tokens_seen": 17861928, + "step": 27365 + }, + { + "epoch": 16.13797169811321, + "grad_norm": 6.480722427368164, + "learning_rate": 1.093823342443185e-06, + "loss": 0.4544, + "num_input_tokens_seen": 17864584, + "step": 27370 + }, + { + "epoch": 16.140919811320753, + "grad_norm": 8.885051727294922, + "learning_rate": 1.0922178772445203e-06, + "loss": 0.3409, + "num_input_tokens_seen": 17867784, + "step": 27375 + }, + { + "epoch": 16.1438679245283, + "grad_norm": 3.8647215366363525, + "learning_rate": 1.0906134466480146e-06, + "loss": 0.3639, + "num_input_tokens_seen": 17871560, + "step": 27380 + }, + { + "epoch": 16.14681603773585, + "grad_norm": 4.726562023162842, + "learning_rate": 1.0890100510784473e-06, + "loss": 0.354, + "num_input_tokens_seen": 17874440, + "step": 27385 + }, + { + "epoch": 16.149764150943398, + "grad_norm": 3.3339004516601562, + "learning_rate": 1.0874076909603227e-06, + "loss": 0.4718, + "num_input_tokens_seen": 17880584, + "step": 27390 + }, + { + "epoch": 16.152712264150942, + "grad_norm": 4.483731269836426, + "learning_rate": 1.0858063667178747e-06, + "loss": 0.3215, + "num_input_tokens_seen": 17884360, + "step": 27395 + }, + { + "epoch": 16.15566037735849, + "grad_norm": 4.394938945770264, + "learning_rate": 1.0842060787750614e-06, + "loss": 0.4204, + "num_input_tokens_seen": 17887272, + "step": 27400 + }, + { + "epoch": 16.15860849056604, + "grad_norm": 3.872849225997925, + "learning_rate": 1.0826068275555652e-06, + "loss": 0.2962, + "num_input_tokens_seen": 17890888, + "step": 27405 + }, + { + "epoch": 16.161556603773583, + "grad_norm": 3.1812868118286133, + "learning_rate": 1.081008613482794e-06, + "loss": 0.2733, + "num_input_tokens_seen": 17894440, + "step": 27410 + }, + { + "epoch": 16.16450471698113, + "grad_norm": 2.7634010314941406, + "learning_rate": 1.079411436979883e-06, + "loss": 0.2423, + "num_input_tokens_seen": 17898184, + "step": 27415 + }, + { + "epoch": 16.16745283018868, + "grad_norm": 3.4479756355285645, + "learning_rate": 1.0778152984696905e-06, + "loss": 0.318, + "num_input_tokens_seen": 17901576, + "step": 27420 + }, + { + "epoch": 16.170400943396228, + "grad_norm": 1.9659678936004639, + "learning_rate": 1.0762201983747993e-06, + "loss": 0.2908, + "num_input_tokens_seen": 17905640, + "step": 27425 + }, + { + "epoch": 16.173349056603772, + "grad_norm": 2.5028533935546875, + "learning_rate": 1.0746261371175238e-06, + "loss": 0.2446, + "num_input_tokens_seen": 17909224, + "step": 27430 + }, + { + "epoch": 16.17629716981132, + "grad_norm": 3.786068916320801, + "learning_rate": 1.0730331151198953e-06, + "loss": 0.2277, + "num_input_tokens_seen": 17912232, + "step": 27435 + }, + { + "epoch": 16.17924528301887, + "grad_norm": 4.085115909576416, + "learning_rate": 1.0714411328036733e-06, + "loss": 0.287, + "num_input_tokens_seen": 17915464, + "step": 27440 + }, + { + "epoch": 16.182193396226417, + "grad_norm": 2.496640682220459, + "learning_rate": 1.0698501905903435e-06, + "loss": 0.2722, + "num_input_tokens_seen": 17918120, + "step": 27445 + }, + { + "epoch": 16.18514150943396, + "grad_norm": 2.3432068824768066, + "learning_rate": 1.0682602889011134e-06, + "loss": 0.3386, + "num_input_tokens_seen": 17921704, + "step": 27450 + }, + { + "epoch": 16.18808962264151, + "grad_norm": 4.427986145019531, + "learning_rate": 1.0666714281569152e-06, + "loss": 0.225, + "num_input_tokens_seen": 17924616, + "step": 27455 + }, + { + "epoch": 16.191037735849058, + "grad_norm": 3.4739325046539307, + "learning_rate": 1.0650836087784095e-06, + "loss": 0.3371, + "num_input_tokens_seen": 17927240, + "step": 27460 + }, + { + "epoch": 16.193985849056602, + "grad_norm": 2.6365692615509033, + "learning_rate": 1.0634968311859768e-06, + "loss": 0.2921, + "num_input_tokens_seen": 17929704, + "step": 27465 + }, + { + "epoch": 16.19693396226415, + "grad_norm": 2.857795000076294, + "learning_rate": 1.0619110957997237e-06, + "loss": 0.3896, + "num_input_tokens_seen": 17932392, + "step": 27470 + }, + { + "epoch": 16.1998820754717, + "grad_norm": 2.2339065074920654, + "learning_rate": 1.06032640303948e-06, + "loss": 0.2726, + "num_input_tokens_seen": 17936264, + "step": 27475 + }, + { + "epoch": 16.202830188679247, + "grad_norm": 3.4672091007232666, + "learning_rate": 1.0587427533248002e-06, + "loss": 0.4424, + "num_input_tokens_seen": 17939848, + "step": 27480 + }, + { + "epoch": 16.20577830188679, + "grad_norm": 3.4370508193969727, + "learning_rate": 1.057160147074961e-06, + "loss": 0.2593, + "num_input_tokens_seen": 17943080, + "step": 27485 + }, + { + "epoch": 16.20872641509434, + "grad_norm": 3.60244083404541, + "learning_rate": 1.0555785847089657e-06, + "loss": 0.2887, + "num_input_tokens_seen": 17945384, + "step": 27490 + }, + { + "epoch": 16.211674528301888, + "grad_norm": 2.8007898330688477, + "learning_rate": 1.0539980666455407e-06, + "loss": 0.379, + "num_input_tokens_seen": 17948488, + "step": 27495 + }, + { + "epoch": 16.214622641509433, + "grad_norm": 3.984285593032837, + "learning_rate": 1.052418593303134e-06, + "loss": 0.3682, + "num_input_tokens_seen": 17953000, + "step": 27500 + }, + { + "epoch": 16.21757075471698, + "grad_norm": 3.0932931900024414, + "learning_rate": 1.0508401650999178e-06, + "loss": 0.2317, + "num_input_tokens_seen": 17956648, + "step": 27505 + }, + { + "epoch": 16.22051886792453, + "grad_norm": 5.290530204772949, + "learning_rate": 1.0492627824537877e-06, + "loss": 0.2588, + "num_input_tokens_seen": 17959368, + "step": 27510 + }, + { + "epoch": 16.223466981132077, + "grad_norm": 4.802031517028809, + "learning_rate": 1.0476864457823626e-06, + "loss": 0.2621, + "num_input_tokens_seen": 17963016, + "step": 27515 + }, + { + "epoch": 16.22641509433962, + "grad_norm": 8.34684944152832, + "learning_rate": 1.0461111555029836e-06, + "loss": 0.4282, + "num_input_tokens_seen": 17965448, + "step": 27520 + }, + { + "epoch": 16.22936320754717, + "grad_norm": 2.3559815883636475, + "learning_rate": 1.0445369120327175e-06, + "loss": 0.2906, + "num_input_tokens_seen": 17969512, + "step": 27525 + }, + { + "epoch": 16.232311320754718, + "grad_norm": 5.384519577026367, + "learning_rate": 1.0429637157883516e-06, + "loss": 0.4581, + "num_input_tokens_seen": 17973000, + "step": 27530 + }, + { + "epoch": 16.235259433962263, + "grad_norm": 2.6878702640533447, + "learning_rate": 1.041391567186395e-06, + "loss": 0.2866, + "num_input_tokens_seen": 17976232, + "step": 27535 + }, + { + "epoch": 16.23820754716981, + "grad_norm": 4.6357550621032715, + "learning_rate": 1.0398204666430821e-06, + "loss": 0.2283, + "num_input_tokens_seen": 17978728, + "step": 27540 + }, + { + "epoch": 16.24115566037736, + "grad_norm": 5.27734375, + "learning_rate": 1.0382504145743667e-06, + "loss": 0.229, + "num_input_tokens_seen": 17982120, + "step": 27545 + }, + { + "epoch": 16.244103773584907, + "grad_norm": 3.3389225006103516, + "learning_rate": 1.0366814113959294e-06, + "loss": 0.2887, + "num_input_tokens_seen": 17985192, + "step": 27550 + }, + { + "epoch": 16.247051886792452, + "grad_norm": 4.908576011657715, + "learning_rate": 1.0351134575231697e-06, + "loss": 0.2458, + "num_input_tokens_seen": 17988456, + "step": 27555 + }, + { + "epoch": 16.25, + "grad_norm": 2.8062827587127686, + "learning_rate": 1.0335465533712098e-06, + "loss": 0.2224, + "num_input_tokens_seen": 17991272, + "step": 27560 + }, + { + "epoch": 16.252948113207548, + "grad_norm": 4.088833332061768, + "learning_rate": 1.031980699354894e-06, + "loss": 0.2824, + "num_input_tokens_seen": 17994792, + "step": 27565 + }, + { + "epoch": 16.255896226415093, + "grad_norm": 3.451603651046753, + "learning_rate": 1.03041589588879e-06, + "loss": 0.2755, + "num_input_tokens_seen": 17997992, + "step": 27570 + }, + { + "epoch": 16.25884433962264, + "grad_norm": 3.2184457778930664, + "learning_rate": 1.0288521433871834e-06, + "loss": 0.2818, + "num_input_tokens_seen": 18001256, + "step": 27575 + }, + { + "epoch": 16.26179245283019, + "grad_norm": 3.632542610168457, + "learning_rate": 1.0272894422640866e-06, + "loss": 0.3305, + "num_input_tokens_seen": 18004008, + "step": 27580 + }, + { + "epoch": 16.264740566037737, + "grad_norm": 4.782447814941406, + "learning_rate": 1.0257277929332332e-06, + "loss": 0.2132, + "num_input_tokens_seen": 18007176, + "step": 27585 + }, + { + "epoch": 16.267688679245282, + "grad_norm": 1.9274590015411377, + "learning_rate": 1.0241671958080745e-06, + "loss": 0.3307, + "num_input_tokens_seen": 18011240, + "step": 27590 + }, + { + "epoch": 16.27063679245283, + "grad_norm": 4.54290246963501, + "learning_rate": 1.0226076513017858e-06, + "loss": 0.3755, + "num_input_tokens_seen": 18014664, + "step": 27595 + }, + { + "epoch": 16.27358490566038, + "grad_norm": 5.171290397644043, + "learning_rate": 1.0210491598272625e-06, + "loss": 0.2907, + "num_input_tokens_seen": 18018536, + "step": 27600 + }, + { + "epoch": 16.276533018867923, + "grad_norm": 4.232462406158447, + "learning_rate": 1.0194917217971229e-06, + "loss": 0.4067, + "num_input_tokens_seen": 18022312, + "step": 27605 + }, + { + "epoch": 16.27948113207547, + "grad_norm": 4.152926921844482, + "learning_rate": 1.0179353376237038e-06, + "loss": 0.2838, + "num_input_tokens_seen": 18024968, + "step": 27610 + }, + { + "epoch": 16.28242924528302, + "grad_norm": 2.1672232151031494, + "learning_rate": 1.0163800077190672e-06, + "loss": 0.2966, + "num_input_tokens_seen": 18028104, + "step": 27615 + }, + { + "epoch": 16.285377358490567, + "grad_norm": 2.8170223236083984, + "learning_rate": 1.0148257324949916e-06, + "loss": 0.3349, + "num_input_tokens_seen": 18031080, + "step": 27620 + }, + { + "epoch": 16.288325471698112, + "grad_norm": 3.1752476692199707, + "learning_rate": 1.0132725123629783e-06, + "loss": 0.2549, + "num_input_tokens_seen": 18033832, + "step": 27625 + }, + { + "epoch": 16.29127358490566, + "grad_norm": 4.458883762359619, + "learning_rate": 1.0117203477342497e-06, + "loss": 0.3281, + "num_input_tokens_seen": 18036264, + "step": 27630 + }, + { + "epoch": 16.29422169811321, + "grad_norm": 4.837848663330078, + "learning_rate": 1.0101692390197477e-06, + "loss": 0.3533, + "num_input_tokens_seen": 18040616, + "step": 27635 + }, + { + "epoch": 16.297169811320753, + "grad_norm": 3.146350145339966, + "learning_rate": 1.0086191866301331e-06, + "loss": 0.2777, + "num_input_tokens_seen": 18044712, + "step": 27640 + }, + { + "epoch": 16.3001179245283, + "grad_norm": 4.456493854522705, + "learning_rate": 1.0070701909757918e-06, + "loss": 0.2246, + "num_input_tokens_seen": 18049256, + "step": 27645 + }, + { + "epoch": 16.30306603773585, + "grad_norm": 3.4150750637054443, + "learning_rate": 1.0055222524668267e-06, + "loss": 0.383, + "num_input_tokens_seen": 18053032, + "step": 27650 + }, + { + "epoch": 16.306014150943398, + "grad_norm": 4.103562355041504, + "learning_rate": 1.00397537151306e-06, + "loss": 0.3418, + "num_input_tokens_seen": 18055368, + "step": 27655 + }, + { + "epoch": 16.308962264150942, + "grad_norm": 3.3517467975616455, + "learning_rate": 1.002429548524036e-06, + "loss": 0.4552, + "num_input_tokens_seen": 18058856, + "step": 27660 + }, + { + "epoch": 16.31191037735849, + "grad_norm": 4.0385050773620605, + "learning_rate": 1.0008847839090175e-06, + "loss": 0.2833, + "num_input_tokens_seen": 18062280, + "step": 27665 + }, + { + "epoch": 16.31485849056604, + "grad_norm": 3.517468214035034, + "learning_rate": 9.993410780769862e-07, + "loss": 0.294, + "num_input_tokens_seen": 18065288, + "step": 27670 + }, + { + "epoch": 16.317806603773583, + "grad_norm": 3.2257468700408936, + "learning_rate": 9.977984314366463e-07, + "loss": 0.3718, + "num_input_tokens_seen": 18069352, + "step": 27675 + }, + { + "epoch": 16.32075471698113, + "grad_norm": 4.003576278686523, + "learning_rate": 9.962568443964216e-07, + "loss": 0.2307, + "num_input_tokens_seen": 18073224, + "step": 27680 + }, + { + "epoch": 16.32370283018868, + "grad_norm": 3.413637638092041, + "learning_rate": 9.947163173644524e-07, + "loss": 0.4056, + "num_input_tokens_seen": 18075880, + "step": 27685 + }, + { + "epoch": 16.326650943396228, + "grad_norm": 3.6316475868225098, + "learning_rate": 9.931768507486007e-07, + "loss": 0.2761, + "num_input_tokens_seen": 18078792, + "step": 27690 + }, + { + "epoch": 16.329599056603772, + "grad_norm": 6.669281005859375, + "learning_rate": 9.916384449564453e-07, + "loss": 0.3088, + "num_input_tokens_seen": 18082248, + "step": 27695 + }, + { + "epoch": 16.33254716981132, + "grad_norm": 2.2219507694244385, + "learning_rate": 9.90101100395287e-07, + "loss": 0.2517, + "num_input_tokens_seen": 18085384, + "step": 27700 + }, + { + "epoch": 16.33549528301887, + "grad_norm": 2.369577407836914, + "learning_rate": 9.885648174721428e-07, + "loss": 0.332, + "num_input_tokens_seen": 18088616, + "step": 27705 + }, + { + "epoch": 16.338443396226417, + "grad_norm": 2.9724032878875732, + "learning_rate": 9.870295965937532e-07, + "loss": 0.2902, + "num_input_tokens_seen": 18092808, + "step": 27710 + }, + { + "epoch": 16.34139150943396, + "grad_norm": 3.9808011054992676, + "learning_rate": 9.854954381665727e-07, + "loss": 0.396, + "num_input_tokens_seen": 18095656, + "step": 27715 + }, + { + "epoch": 16.34433962264151, + "grad_norm": 4.299487590789795, + "learning_rate": 9.83962342596776e-07, + "loss": 0.2304, + "num_input_tokens_seen": 18098120, + "step": 27720 + }, + { + "epoch": 16.347287735849058, + "grad_norm": 2.4551925659179688, + "learning_rate": 9.824303102902576e-07, + "loss": 0.2218, + "num_input_tokens_seen": 18101768, + "step": 27725 + }, + { + "epoch": 16.350235849056602, + "grad_norm": 5.087007522583008, + "learning_rate": 9.808993416526292e-07, + "loss": 0.33, + "num_input_tokens_seen": 18105352, + "step": 27730 + }, + { + "epoch": 16.35318396226415, + "grad_norm": 4.335859298706055, + "learning_rate": 9.793694370892204e-07, + "loss": 0.2587, + "num_input_tokens_seen": 18110600, + "step": 27735 + }, + { + "epoch": 16.3561320754717, + "grad_norm": 3.5604605674743652, + "learning_rate": 9.77840597005082e-07, + "loss": 0.2693, + "num_input_tokens_seen": 18114440, + "step": 27740 + }, + { + "epoch": 16.359080188679247, + "grad_norm": 4.816673278808594, + "learning_rate": 9.763128218049806e-07, + "loss": 0.4437, + "num_input_tokens_seen": 18117352, + "step": 27745 + }, + { + "epoch": 16.36202830188679, + "grad_norm": 2.940211772918701, + "learning_rate": 9.747861118934005e-07, + "loss": 0.3237, + "num_input_tokens_seen": 18120680, + "step": 27750 + }, + { + "epoch": 16.36497641509434, + "grad_norm": 4.41790246963501, + "learning_rate": 9.732604676745443e-07, + "loss": 0.2719, + "num_input_tokens_seen": 18123240, + "step": 27755 + }, + { + "epoch": 16.367924528301888, + "grad_norm": 5.458648681640625, + "learning_rate": 9.717358895523333e-07, + "loss": 0.3171, + "num_input_tokens_seen": 18125832, + "step": 27760 + }, + { + "epoch": 16.370872641509433, + "grad_norm": 3.7510673999786377, + "learning_rate": 9.702123779304074e-07, + "loss": 0.2841, + "num_input_tokens_seen": 18128872, + "step": 27765 + }, + { + "epoch": 16.37382075471698, + "grad_norm": 3.298093795776367, + "learning_rate": 9.686899332121203e-07, + "loss": 0.2985, + "num_input_tokens_seen": 18131688, + "step": 27770 + }, + { + "epoch": 16.37676886792453, + "grad_norm": 1.7564769983291626, + "learning_rate": 9.671685558005488e-07, + "loss": 0.1736, + "num_input_tokens_seen": 18135304, + "step": 27775 + }, + { + "epoch": 16.379716981132077, + "grad_norm": 4.581481456756592, + "learning_rate": 9.656482460984828e-07, + "loss": 0.2442, + "num_input_tokens_seen": 18138504, + "step": 27780 + }, + { + "epoch": 16.38266509433962, + "grad_norm": 2.838304281234741, + "learning_rate": 9.641290045084307e-07, + "loss": 0.3456, + "num_input_tokens_seen": 18141864, + "step": 27785 + }, + { + "epoch": 16.38561320754717, + "grad_norm": 3.9666428565979004, + "learning_rate": 9.626108314326182e-07, + "loss": 0.2506, + "num_input_tokens_seen": 18144872, + "step": 27790 + }, + { + "epoch": 16.388561320754718, + "grad_norm": 3.7685439586639404, + "learning_rate": 9.610937272729881e-07, + "loss": 0.2807, + "num_input_tokens_seen": 18149448, + "step": 27795 + }, + { + "epoch": 16.391509433962263, + "grad_norm": 2.9581820964813232, + "learning_rate": 9.595776924311996e-07, + "loss": 0.4135, + "num_input_tokens_seen": 18152968, + "step": 27800 + }, + { + "epoch": 16.39445754716981, + "grad_norm": 8.78770923614502, + "learning_rate": 9.580627273086313e-07, + "loss": 0.3678, + "num_input_tokens_seen": 18156264, + "step": 27805 + }, + { + "epoch": 16.39740566037736, + "grad_norm": 5.531522274017334, + "learning_rate": 9.565488323063754e-07, + "loss": 0.2459, + "num_input_tokens_seen": 18159112, + "step": 27810 + }, + { + "epoch": 16.400353773584907, + "grad_norm": 4.561558723449707, + "learning_rate": 9.55036007825243e-07, + "loss": 0.3786, + "num_input_tokens_seen": 18161800, + "step": 27815 + }, + { + "epoch": 16.403301886792452, + "grad_norm": 4.129857063293457, + "learning_rate": 9.535242542657602e-07, + "loss": 0.3418, + "num_input_tokens_seen": 18164808, + "step": 27820 + }, + { + "epoch": 16.40625, + "grad_norm": 4.079511642456055, + "learning_rate": 9.520135720281692e-07, + "loss": 0.508, + "num_input_tokens_seen": 18168072, + "step": 27825 + }, + { + "epoch": 16.409198113207548, + "grad_norm": 5.035636901855469, + "learning_rate": 9.505039615124318e-07, + "loss": 0.2945, + "num_input_tokens_seen": 18171464, + "step": 27830 + }, + { + "epoch": 16.412146226415093, + "grad_norm": 3.3385486602783203, + "learning_rate": 9.489954231182235e-07, + "loss": 0.3909, + "num_input_tokens_seen": 18174824, + "step": 27835 + }, + { + "epoch": 16.41509433962264, + "grad_norm": 3.351029872894287, + "learning_rate": 9.474879572449352e-07, + "loss": 0.309, + "num_input_tokens_seen": 18177672, + "step": 27840 + }, + { + "epoch": 16.41804245283019, + "grad_norm": 4.073945045471191, + "learning_rate": 9.459815642916759e-07, + "loss": 0.2923, + "num_input_tokens_seen": 18180680, + "step": 27845 + }, + { + "epoch": 16.420990566037737, + "grad_norm": 3.6125144958496094, + "learning_rate": 9.444762446572692e-07, + "loss": 0.1989, + "num_input_tokens_seen": 18183176, + "step": 27850 + }, + { + "epoch": 16.423938679245282, + "grad_norm": 1.9218395948410034, + "learning_rate": 9.429719987402541e-07, + "loss": 0.2629, + "num_input_tokens_seen": 18186920, + "step": 27855 + }, + { + "epoch": 16.42688679245283, + "grad_norm": 3.80049729347229, + "learning_rate": 9.414688269388883e-07, + "loss": 0.3354, + "num_input_tokens_seen": 18190248, + "step": 27860 + }, + { + "epoch": 16.42983490566038, + "grad_norm": 6.680559158325195, + "learning_rate": 9.3996672965114e-07, + "loss": 0.264, + "num_input_tokens_seen": 18193320, + "step": 27865 + }, + { + "epoch": 16.432783018867923, + "grad_norm": 3.0099081993103027, + "learning_rate": 9.384657072747e-07, + "loss": 0.2176, + "num_input_tokens_seen": 18196776, + "step": 27870 + }, + { + "epoch": 16.43573113207547, + "grad_norm": 4.875786781311035, + "learning_rate": 9.369657602069676e-07, + "loss": 0.3374, + "num_input_tokens_seen": 18199496, + "step": 27875 + }, + { + "epoch": 16.43867924528302, + "grad_norm": 3.948143243789673, + "learning_rate": 9.354668888450608e-07, + "loss": 0.371, + "num_input_tokens_seen": 18201800, + "step": 27880 + }, + { + "epoch": 16.441627358490567, + "grad_norm": 2.5716280937194824, + "learning_rate": 9.339690935858125e-07, + "loss": 0.3468, + "num_input_tokens_seen": 18204648, + "step": 27885 + }, + { + "epoch": 16.444575471698112, + "grad_norm": 4.36135721206665, + "learning_rate": 9.324723748257697e-07, + "loss": 0.3576, + "num_input_tokens_seen": 18207528, + "step": 27890 + }, + { + "epoch": 16.44752358490566, + "grad_norm": 3.9799201488494873, + "learning_rate": 9.309767329611963e-07, + "loss": 0.3242, + "num_input_tokens_seen": 18210568, + "step": 27895 + }, + { + "epoch": 16.45047169811321, + "grad_norm": 2.681002616882324, + "learning_rate": 9.294821683880695e-07, + "loss": 0.2337, + "num_input_tokens_seen": 18213544, + "step": 27900 + }, + { + "epoch": 16.453419811320753, + "grad_norm": 2.5427801609039307, + "learning_rate": 9.279886815020816e-07, + "loss": 0.2543, + "num_input_tokens_seen": 18217352, + "step": 27905 + }, + { + "epoch": 16.4563679245283, + "grad_norm": 3.7799453735351562, + "learning_rate": 9.264962726986393e-07, + "loss": 0.3855, + "num_input_tokens_seen": 18220456, + "step": 27910 + }, + { + "epoch": 16.45931603773585, + "grad_norm": 1.9801958799362183, + "learning_rate": 9.250049423728652e-07, + "loss": 0.2477, + "num_input_tokens_seen": 18223880, + "step": 27915 + }, + { + "epoch": 16.462264150943398, + "grad_norm": 4.047691345214844, + "learning_rate": 9.235146909195936e-07, + "loss": 0.3087, + "num_input_tokens_seen": 18226856, + "step": 27920 + }, + { + "epoch": 16.465212264150942, + "grad_norm": 4.105132579803467, + "learning_rate": 9.220255187333771e-07, + "loss": 0.2958, + "num_input_tokens_seen": 18230248, + "step": 27925 + }, + { + "epoch": 16.46816037735849, + "grad_norm": 7.007116317749023, + "learning_rate": 9.205374262084798e-07, + "loss": 0.3608, + "num_input_tokens_seen": 18232136, + "step": 27930 + }, + { + "epoch": 16.47110849056604, + "grad_norm": 3.514346122741699, + "learning_rate": 9.190504137388806e-07, + "loss": 0.4124, + "num_input_tokens_seen": 18236552, + "step": 27935 + }, + { + "epoch": 16.474056603773583, + "grad_norm": 4.259034633636475, + "learning_rate": 9.175644817182722e-07, + "loss": 0.3101, + "num_input_tokens_seen": 18239240, + "step": 27940 + }, + { + "epoch": 16.47700471698113, + "grad_norm": 2.2055740356445312, + "learning_rate": 9.16079630540061e-07, + "loss": 0.3018, + "num_input_tokens_seen": 18242088, + "step": 27945 + }, + { + "epoch": 16.47995283018868, + "grad_norm": 3.444841146469116, + "learning_rate": 9.145958605973676e-07, + "loss": 0.3046, + "num_input_tokens_seen": 18244936, + "step": 27950 + }, + { + "epoch": 16.482900943396228, + "grad_norm": 4.489775657653809, + "learning_rate": 9.131131722830289e-07, + "loss": 0.3508, + "num_input_tokens_seen": 18248520, + "step": 27955 + }, + { + "epoch": 16.485849056603772, + "grad_norm": 4.1201491355896, + "learning_rate": 9.116315659895892e-07, + "loss": 0.387, + "num_input_tokens_seen": 18251752, + "step": 27960 + }, + { + "epoch": 16.48879716981132, + "grad_norm": 3.685603618621826, + "learning_rate": 9.10151042109314e-07, + "loss": 0.2943, + "num_input_tokens_seen": 18255144, + "step": 27965 + }, + { + "epoch": 16.49174528301887, + "grad_norm": 4.379791736602783, + "learning_rate": 9.086716010341767e-07, + "loss": 0.3224, + "num_input_tokens_seen": 18258376, + "step": 27970 + }, + { + "epoch": 16.494693396226417, + "grad_norm": 3.7092583179473877, + "learning_rate": 9.071932431558655e-07, + "loss": 0.23, + "num_input_tokens_seen": 18261128, + "step": 27975 + }, + { + "epoch": 16.49764150943396, + "grad_norm": 2.9440362453460693, + "learning_rate": 9.057159688657824e-07, + "loss": 0.2526, + "num_input_tokens_seen": 18264360, + "step": 27980 + }, + { + "epoch": 16.50058962264151, + "grad_norm": 3.211334466934204, + "learning_rate": 9.042397785550405e-07, + "loss": 0.2936, + "num_input_tokens_seen": 18267528, + "step": 27985 + }, + { + "epoch": 16.503537735849058, + "grad_norm": 2.7018685340881348, + "learning_rate": 9.027646726144707e-07, + "loss": 0.299, + "num_input_tokens_seen": 18270952, + "step": 27990 + }, + { + "epoch": 16.506485849056602, + "grad_norm": 5.471151351928711, + "learning_rate": 9.012906514346115e-07, + "loss": 0.2675, + "num_input_tokens_seen": 18274472, + "step": 27995 + }, + { + "epoch": 16.50943396226415, + "grad_norm": 3.833176612854004, + "learning_rate": 8.99817715405717e-07, + "loss": 0.2104, + "num_input_tokens_seen": 18278568, + "step": 28000 + }, + { + "epoch": 16.5123820754717, + "grad_norm": 4.792274475097656, + "learning_rate": 8.983458649177529e-07, + "loss": 0.226, + "num_input_tokens_seen": 18280936, + "step": 28005 + }, + { + "epoch": 16.515330188679247, + "grad_norm": 3.603153944015503, + "learning_rate": 8.968751003603982e-07, + "loss": 0.3768, + "num_input_tokens_seen": 18283944, + "step": 28010 + }, + { + "epoch": 16.51827830188679, + "grad_norm": 9.184059143066406, + "learning_rate": 8.95405422123043e-07, + "loss": 0.3887, + "num_input_tokens_seen": 18287144, + "step": 28015 + }, + { + "epoch": 16.52122641509434, + "grad_norm": 2.5323679447174072, + "learning_rate": 8.939368305947932e-07, + "loss": 0.4066, + "num_input_tokens_seen": 18291080, + "step": 28020 + }, + { + "epoch": 16.524174528301888, + "grad_norm": 3.2594141960144043, + "learning_rate": 8.92469326164464e-07, + "loss": 0.281, + "num_input_tokens_seen": 18293768, + "step": 28025 + }, + { + "epoch": 16.527122641509433, + "grad_norm": 3.4369442462921143, + "learning_rate": 8.910029092205829e-07, + "loss": 0.1902, + "num_input_tokens_seen": 18296040, + "step": 28030 + }, + { + "epoch": 16.53007075471698, + "grad_norm": 2.4512956142425537, + "learning_rate": 8.895375801513906e-07, + "loss": 0.1956, + "num_input_tokens_seen": 18300552, + "step": 28035 + }, + { + "epoch": 16.53301886792453, + "grad_norm": 2.2350974082946777, + "learning_rate": 8.880733393448377e-07, + "loss": 0.266, + "num_input_tokens_seen": 18304040, + "step": 28040 + }, + { + "epoch": 16.535966981132077, + "grad_norm": 3.849118232727051, + "learning_rate": 8.866101871885907e-07, + "loss": 0.3377, + "num_input_tokens_seen": 18307016, + "step": 28045 + }, + { + "epoch": 16.53891509433962, + "grad_norm": 3.746448040008545, + "learning_rate": 8.851481240700249e-07, + "loss": 0.3493, + "num_input_tokens_seen": 18310344, + "step": 28050 + }, + { + "epoch": 16.54186320754717, + "grad_norm": 3.6305148601531982, + "learning_rate": 8.836871503762257e-07, + "loss": 0.2163, + "num_input_tokens_seen": 18313704, + "step": 28055 + }, + { + "epoch": 16.544811320754718, + "grad_norm": 5.9658331871032715, + "learning_rate": 8.822272664939946e-07, + "loss": 0.297, + "num_input_tokens_seen": 18317032, + "step": 28060 + }, + { + "epoch": 16.547759433962263, + "grad_norm": 5.061834812164307, + "learning_rate": 8.80768472809842e-07, + "loss": 0.3069, + "num_input_tokens_seen": 18320008, + "step": 28065 + }, + { + "epoch": 16.55070754716981, + "grad_norm": 4.820870876312256, + "learning_rate": 8.793107697099884e-07, + "loss": 0.5205, + "num_input_tokens_seen": 18322888, + "step": 28070 + }, + { + "epoch": 16.55365566037736, + "grad_norm": 5.755917072296143, + "learning_rate": 8.778541575803673e-07, + "loss": 0.3003, + "num_input_tokens_seen": 18325608, + "step": 28075 + }, + { + "epoch": 16.556603773584907, + "grad_norm": 2.8354055881500244, + "learning_rate": 8.763986368066241e-07, + "loss": 0.2158, + "num_input_tokens_seen": 18328616, + "step": 28080 + }, + { + "epoch": 16.559551886792452, + "grad_norm": 3.812364339828491, + "learning_rate": 8.749442077741138e-07, + "loss": 0.2683, + "num_input_tokens_seen": 18331496, + "step": 28085 + }, + { + "epoch": 16.5625, + "grad_norm": 3.0752851963043213, + "learning_rate": 8.734908708679024e-07, + "loss": 0.4226, + "num_input_tokens_seen": 18334440, + "step": 28090 + }, + { + "epoch": 16.565448113207548, + "grad_norm": 8.289811134338379, + "learning_rate": 8.72038626472767e-07, + "loss": 0.3538, + "num_input_tokens_seen": 18337384, + "step": 28095 + }, + { + "epoch": 16.568396226415093, + "grad_norm": 3.749800205230713, + "learning_rate": 8.705874749731962e-07, + "loss": 0.3107, + "num_input_tokens_seen": 18340520, + "step": 28100 + }, + { + "epoch": 16.57134433962264, + "grad_norm": 3.3201005458831787, + "learning_rate": 8.691374167533867e-07, + "loss": 0.3541, + "num_input_tokens_seen": 18343656, + "step": 28105 + }, + { + "epoch": 16.57429245283019, + "grad_norm": 7.969613075256348, + "learning_rate": 8.6768845219725e-07, + "loss": 0.2594, + "num_input_tokens_seen": 18347176, + "step": 28110 + }, + { + "epoch": 16.577240566037737, + "grad_norm": 5.6708292961120605, + "learning_rate": 8.662405816884056e-07, + "loss": 0.3494, + "num_input_tokens_seen": 18350792, + "step": 28115 + }, + { + "epoch": 16.580188679245282, + "grad_norm": 2.214641571044922, + "learning_rate": 8.647938056101824e-07, + "loss": 0.1981, + "num_input_tokens_seen": 18354408, + "step": 28120 + }, + { + "epoch": 16.58313679245283, + "grad_norm": 3.3968143463134766, + "learning_rate": 8.63348124345621e-07, + "loss": 0.3991, + "num_input_tokens_seen": 18357352, + "step": 28125 + }, + { + "epoch": 16.58608490566038, + "grad_norm": 2.846489191055298, + "learning_rate": 8.619035382774716e-07, + "loss": 0.2869, + "num_input_tokens_seen": 18360136, + "step": 28130 + }, + { + "epoch": 16.589033018867923, + "grad_norm": 3.7617170810699463, + "learning_rate": 8.60460047788193e-07, + "loss": 0.2447, + "num_input_tokens_seen": 18363080, + "step": 28135 + }, + { + "epoch": 16.59198113207547, + "grad_norm": 3.938455820083618, + "learning_rate": 8.590176532599587e-07, + "loss": 0.2602, + "num_input_tokens_seen": 18367528, + "step": 28140 + }, + { + "epoch": 16.59492924528302, + "grad_norm": 4.964748859405518, + "learning_rate": 8.575763550746475e-07, + "loss": 0.276, + "num_input_tokens_seen": 18370856, + "step": 28145 + }, + { + "epoch": 16.597877358490567, + "grad_norm": 4.684011936187744, + "learning_rate": 8.56136153613848e-07, + "loss": 0.4144, + "num_input_tokens_seen": 18373864, + "step": 28150 + }, + { + "epoch": 16.600825471698112, + "grad_norm": 3.855015277862549, + "learning_rate": 8.546970492588619e-07, + "loss": 0.2785, + "num_input_tokens_seen": 18377544, + "step": 28155 + }, + { + "epoch": 16.60377358490566, + "grad_norm": 3.310488224029541, + "learning_rate": 8.532590423906973e-07, + "loss": 0.3432, + "num_input_tokens_seen": 18380680, + "step": 28160 + }, + { + "epoch": 16.60672169811321, + "grad_norm": 3.829721689224243, + "learning_rate": 8.518221333900728e-07, + "loss": 0.2575, + "num_input_tokens_seen": 18383560, + "step": 28165 + }, + { + "epoch": 16.609669811320753, + "grad_norm": 3.348219156265259, + "learning_rate": 8.503863226374148e-07, + "loss": 0.3153, + "num_input_tokens_seen": 18386312, + "step": 28170 + }, + { + "epoch": 16.6126179245283, + "grad_norm": 4.839154243469238, + "learning_rate": 8.489516105128632e-07, + "loss": 0.263, + "num_input_tokens_seen": 18389768, + "step": 28175 + }, + { + "epoch": 16.61556603773585, + "grad_norm": 6.341581344604492, + "learning_rate": 8.475179973962621e-07, + "loss": 0.3698, + "num_input_tokens_seen": 18393096, + "step": 28180 + }, + { + "epoch": 16.618514150943398, + "grad_norm": 3.94744610786438, + "learning_rate": 8.460854836671678e-07, + "loss": 0.2733, + "num_input_tokens_seen": 18396136, + "step": 28185 + }, + { + "epoch": 16.621462264150942, + "grad_norm": 5.274620532989502, + "learning_rate": 8.446540697048445e-07, + "loss": 0.2841, + "num_input_tokens_seen": 18399016, + "step": 28190 + }, + { + "epoch": 16.62441037735849, + "grad_norm": 4.905893802642822, + "learning_rate": 8.432237558882639e-07, + "loss": 0.3294, + "num_input_tokens_seen": 18402792, + "step": 28195 + }, + { + "epoch": 16.62735849056604, + "grad_norm": 5.490345478057861, + "learning_rate": 8.417945425961083e-07, + "loss": 0.2806, + "num_input_tokens_seen": 18405736, + "step": 28200 + }, + { + "epoch": 16.630306603773583, + "grad_norm": 4.3853044509887695, + "learning_rate": 8.403664302067688e-07, + "loss": 0.383, + "num_input_tokens_seen": 18408968, + "step": 28205 + }, + { + "epoch": 16.63325471698113, + "grad_norm": 3.4984960556030273, + "learning_rate": 8.389394190983446e-07, + "loss": 0.2602, + "num_input_tokens_seen": 18412200, + "step": 28210 + }, + { + "epoch": 16.63620283018868, + "grad_norm": 3.7801666259765625, + "learning_rate": 8.37513509648642e-07, + "loss": 0.2409, + "num_input_tokens_seen": 18415336, + "step": 28215 + }, + { + "epoch": 16.639150943396228, + "grad_norm": 5.987263202667236, + "learning_rate": 8.360887022351771e-07, + "loss": 0.4233, + "num_input_tokens_seen": 18418728, + "step": 28220 + }, + { + "epoch": 16.642099056603772, + "grad_norm": 2.0957329273223877, + "learning_rate": 8.346649972351739e-07, + "loss": 0.3023, + "num_input_tokens_seen": 18425896, + "step": 28225 + }, + { + "epoch": 16.64504716981132, + "grad_norm": 3.4638280868530273, + "learning_rate": 8.33242395025563e-07, + "loss": 0.3672, + "num_input_tokens_seen": 18428552, + "step": 28230 + }, + { + "epoch": 16.64799528301887, + "grad_norm": 3.376800298690796, + "learning_rate": 8.318208959829871e-07, + "loss": 0.3192, + "num_input_tokens_seen": 18431176, + "step": 28235 + }, + { + "epoch": 16.650943396226417, + "grad_norm": 4.078437328338623, + "learning_rate": 8.304005004837929e-07, + "loss": 0.3292, + "num_input_tokens_seen": 18434760, + "step": 28240 + }, + { + "epoch": 16.65389150943396, + "grad_norm": 2.1647026538848877, + "learning_rate": 8.289812089040344e-07, + "loss": 0.3402, + "num_input_tokens_seen": 18437896, + "step": 28245 + }, + { + "epoch": 16.65683962264151, + "grad_norm": 3.9181089401245117, + "learning_rate": 8.275630216194785e-07, + "loss": 0.291, + "num_input_tokens_seen": 18441064, + "step": 28250 + }, + { + "epoch": 16.659787735849058, + "grad_norm": 2.9407787322998047, + "learning_rate": 8.261459390055948e-07, + "loss": 0.3331, + "num_input_tokens_seen": 18443560, + "step": 28255 + }, + { + "epoch": 16.662735849056602, + "grad_norm": 5.153355598449707, + "learning_rate": 8.24729961437562e-07, + "loss": 0.3333, + "num_input_tokens_seen": 18446632, + "step": 28260 + }, + { + "epoch": 16.66568396226415, + "grad_norm": 2.584152936935425, + "learning_rate": 8.233150892902653e-07, + "loss": 0.3638, + "num_input_tokens_seen": 18450088, + "step": 28265 + }, + { + "epoch": 16.6686320754717, + "grad_norm": 2.558051347732544, + "learning_rate": 8.219013229383005e-07, + "loss": 0.2796, + "num_input_tokens_seen": 18454152, + "step": 28270 + }, + { + "epoch": 16.671580188679247, + "grad_norm": 4.4236555099487305, + "learning_rate": 8.204886627559666e-07, + "loss": 0.3369, + "num_input_tokens_seen": 18456840, + "step": 28275 + }, + { + "epoch": 16.67452830188679, + "grad_norm": 2.3827240467071533, + "learning_rate": 8.190771091172722e-07, + "loss": 0.2469, + "num_input_tokens_seen": 18460168, + "step": 28280 + }, + { + "epoch": 16.67747641509434, + "grad_norm": 7.2898406982421875, + "learning_rate": 8.176666623959323e-07, + "loss": 0.5131, + "num_input_tokens_seen": 18463464, + "step": 28285 + }, + { + "epoch": 16.680424528301888, + "grad_norm": 5.622857570648193, + "learning_rate": 8.162573229653681e-07, + "loss": 0.2696, + "num_input_tokens_seen": 18466952, + "step": 28290 + }, + { + "epoch": 16.683372641509433, + "grad_norm": 4.583674907684326, + "learning_rate": 8.148490911987073e-07, + "loss": 0.2215, + "num_input_tokens_seen": 18469448, + "step": 28295 + }, + { + "epoch": 16.68632075471698, + "grad_norm": 5.918069839477539, + "learning_rate": 8.134419674687876e-07, + "loss": 0.4376, + "num_input_tokens_seen": 18473000, + "step": 28300 + }, + { + "epoch": 16.68926886792453, + "grad_norm": 4.1532511711120605, + "learning_rate": 8.120359521481502e-07, + "loss": 0.3218, + "num_input_tokens_seen": 18475848, + "step": 28305 + }, + { + "epoch": 16.692216981132077, + "grad_norm": 3.7391672134399414, + "learning_rate": 8.106310456090438e-07, + "loss": 0.2893, + "num_input_tokens_seen": 18479848, + "step": 28310 + }, + { + "epoch": 16.69516509433962, + "grad_norm": 3.1562418937683105, + "learning_rate": 8.092272482234231e-07, + "loss": 0.3681, + "num_input_tokens_seen": 18484264, + "step": 28315 + }, + { + "epoch": 16.69811320754717, + "grad_norm": 3.536468982696533, + "learning_rate": 8.078245603629486e-07, + "loss": 0.2736, + "num_input_tokens_seen": 18487080, + "step": 28320 + }, + { + "epoch": 16.701061320754718, + "grad_norm": 2.602874517440796, + "learning_rate": 8.0642298239899e-07, + "loss": 0.2139, + "num_input_tokens_seen": 18490248, + "step": 28325 + }, + { + "epoch": 16.704009433962263, + "grad_norm": 2.5799262523651123, + "learning_rate": 8.050225147026202e-07, + "loss": 0.3513, + "num_input_tokens_seen": 18494504, + "step": 28330 + }, + { + "epoch": 16.70695754716981, + "grad_norm": 7.087551593780518, + "learning_rate": 8.03623157644619e-07, + "loss": 0.4131, + "num_input_tokens_seen": 18498120, + "step": 28335 + }, + { + "epoch": 16.70990566037736, + "grad_norm": 5.125203609466553, + "learning_rate": 8.022249115954728e-07, + "loss": 0.2537, + "num_input_tokens_seen": 18501768, + "step": 28340 + }, + { + "epoch": 16.712853773584907, + "grad_norm": 3.553215980529785, + "learning_rate": 8.008277769253709e-07, + "loss": 0.2565, + "num_input_tokens_seen": 18504136, + "step": 28345 + }, + { + "epoch": 16.715801886792452, + "grad_norm": 5.535646438598633, + "learning_rate": 7.994317540042135e-07, + "loss": 0.3364, + "num_input_tokens_seen": 18507208, + "step": 28350 + }, + { + "epoch": 16.71875, + "grad_norm": 3.334515333175659, + "learning_rate": 7.980368432016017e-07, + "loss": 0.1977, + "num_input_tokens_seen": 18510472, + "step": 28355 + }, + { + "epoch": 16.721698113207548, + "grad_norm": 4.659951686859131, + "learning_rate": 7.966430448868461e-07, + "loss": 0.313, + "num_input_tokens_seen": 18513064, + "step": 28360 + }, + { + "epoch": 16.724646226415093, + "grad_norm": 3.0508806705474854, + "learning_rate": 7.952503594289601e-07, + "loss": 0.3984, + "num_input_tokens_seen": 18515624, + "step": 28365 + }, + { + "epoch": 16.72759433962264, + "grad_norm": 6.148863315582275, + "learning_rate": 7.93858787196663e-07, + "loss": 0.3171, + "num_input_tokens_seen": 18518888, + "step": 28370 + }, + { + "epoch": 16.73054245283019, + "grad_norm": 7.815550327301025, + "learning_rate": 7.92468328558379e-07, + "loss": 0.3995, + "num_input_tokens_seen": 18521416, + "step": 28375 + }, + { + "epoch": 16.733490566037737, + "grad_norm": 4.067500591278076, + "learning_rate": 7.910789838822386e-07, + "loss": 0.392, + "num_input_tokens_seen": 18524712, + "step": 28380 + }, + { + "epoch": 16.736438679245282, + "grad_norm": 5.509260177612305, + "learning_rate": 7.89690753536076e-07, + "loss": 0.4172, + "num_input_tokens_seen": 18527560, + "step": 28385 + }, + { + "epoch": 16.73938679245283, + "grad_norm": 4.308719635009766, + "learning_rate": 7.883036378874326e-07, + "loss": 0.3907, + "num_input_tokens_seen": 18530696, + "step": 28390 + }, + { + "epoch": 16.74233490566038, + "grad_norm": 3.7203173637390137, + "learning_rate": 7.86917637303552e-07, + "loss": 0.2807, + "num_input_tokens_seen": 18534280, + "step": 28395 + }, + { + "epoch": 16.745283018867923, + "grad_norm": 3.3045477867126465, + "learning_rate": 7.855327521513851e-07, + "loss": 0.3088, + "num_input_tokens_seen": 18536648, + "step": 28400 + }, + { + "epoch": 16.74823113207547, + "grad_norm": 3.9346742630004883, + "learning_rate": 7.841489827975851e-07, + "loss": 0.2986, + "num_input_tokens_seen": 18539496, + "step": 28405 + }, + { + "epoch": 16.75117924528302, + "grad_norm": 6.151522636413574, + "learning_rate": 7.827663296085109e-07, + "loss": 0.3336, + "num_input_tokens_seen": 18541896, + "step": 28410 + }, + { + "epoch": 16.754127358490567, + "grad_norm": 4.613429069519043, + "learning_rate": 7.813847929502255e-07, + "loss": 0.336, + "num_input_tokens_seen": 18544840, + "step": 28415 + }, + { + "epoch": 16.757075471698112, + "grad_norm": 1.8258585929870605, + "learning_rate": 7.800043731884982e-07, + "loss": 0.2786, + "num_input_tokens_seen": 18548808, + "step": 28420 + }, + { + "epoch": 16.76002358490566, + "grad_norm": 2.6319079399108887, + "learning_rate": 7.786250706888005e-07, + "loss": 0.3584, + "num_input_tokens_seen": 18552040, + "step": 28425 + }, + { + "epoch": 16.76297169811321, + "grad_norm": 2.326843500137329, + "learning_rate": 7.772468858163085e-07, + "loss": 0.2194, + "num_input_tokens_seen": 18555176, + "step": 28430 + }, + { + "epoch": 16.765919811320753, + "grad_norm": 4.060580730438232, + "learning_rate": 7.758698189359026e-07, + "loss": 0.2222, + "num_input_tokens_seen": 18561032, + "step": 28435 + }, + { + "epoch": 16.7688679245283, + "grad_norm": 4.190971374511719, + "learning_rate": 7.744938704121658e-07, + "loss": 0.2306, + "num_input_tokens_seen": 18563848, + "step": 28440 + }, + { + "epoch": 16.77181603773585, + "grad_norm": 2.9407505989074707, + "learning_rate": 7.731190406093892e-07, + "loss": 0.2519, + "num_input_tokens_seen": 18567144, + "step": 28445 + }, + { + "epoch": 16.774764150943398, + "grad_norm": 3.611067533493042, + "learning_rate": 7.717453298915617e-07, + "loss": 0.526, + "num_input_tokens_seen": 18569768, + "step": 28450 + }, + { + "epoch": 16.777712264150942, + "grad_norm": 3.9767723083496094, + "learning_rate": 7.703727386223825e-07, + "loss": 0.2791, + "num_input_tokens_seen": 18572328, + "step": 28455 + }, + { + "epoch": 16.78066037735849, + "grad_norm": 5.371555805206299, + "learning_rate": 7.690012671652491e-07, + "loss": 0.3235, + "num_input_tokens_seen": 18575272, + "step": 28460 + }, + { + "epoch": 16.78360849056604, + "grad_norm": 3.3217289447784424, + "learning_rate": 7.676309158832651e-07, + "loss": 0.3406, + "num_input_tokens_seen": 18578152, + "step": 28465 + }, + { + "epoch": 16.786556603773583, + "grad_norm": 4.040096282958984, + "learning_rate": 7.662616851392362e-07, + "loss": 0.3584, + "num_input_tokens_seen": 18581608, + "step": 28470 + }, + { + "epoch": 16.78950471698113, + "grad_norm": 7.39586877822876, + "learning_rate": 7.648935752956732e-07, + "loss": 0.5609, + "num_input_tokens_seen": 18585512, + "step": 28475 + }, + { + "epoch": 16.79245283018868, + "grad_norm": 2.883486270904541, + "learning_rate": 7.635265867147867e-07, + "loss": 0.3372, + "num_input_tokens_seen": 18587912, + "step": 28480 + }, + { + "epoch": 16.795400943396228, + "grad_norm": 3.3772594928741455, + "learning_rate": 7.621607197584963e-07, + "loss": 0.3827, + "num_input_tokens_seen": 18591464, + "step": 28485 + }, + { + "epoch": 16.798349056603772, + "grad_norm": 3.912789821624756, + "learning_rate": 7.607959747884186e-07, + "loss": 0.454, + "num_input_tokens_seen": 18594312, + "step": 28490 + }, + { + "epoch": 16.80129716981132, + "grad_norm": 4.989803314208984, + "learning_rate": 7.594323521658769e-07, + "loss": 0.3148, + "num_input_tokens_seen": 18597256, + "step": 28495 + }, + { + "epoch": 16.80424528301887, + "grad_norm": 3.8708603382110596, + "learning_rate": 7.580698522518958e-07, + "loss": 0.3437, + "num_input_tokens_seen": 18600168, + "step": 28500 + }, + { + "epoch": 16.807193396226417, + "grad_norm": 4.615939140319824, + "learning_rate": 7.567084754072035e-07, + "loss": 0.2778, + "num_input_tokens_seen": 18603304, + "step": 28505 + }, + { + "epoch": 16.81014150943396, + "grad_norm": 6.80972146987915, + "learning_rate": 7.553482219922282e-07, + "loss": 0.2765, + "num_input_tokens_seen": 18606312, + "step": 28510 + }, + { + "epoch": 16.81308962264151, + "grad_norm": 2.9495906829833984, + "learning_rate": 7.539890923671061e-07, + "loss": 0.2633, + "num_input_tokens_seen": 18609576, + "step": 28515 + }, + { + "epoch": 16.816037735849058, + "grad_norm": 5.903151988983154, + "learning_rate": 7.526310868916708e-07, + "loss": 0.2989, + "num_input_tokens_seen": 18612904, + "step": 28520 + }, + { + "epoch": 16.818985849056602, + "grad_norm": 2.3333492279052734, + "learning_rate": 7.512742059254602e-07, + "loss": 0.2481, + "num_input_tokens_seen": 18616616, + "step": 28525 + }, + { + "epoch": 16.82193396226415, + "grad_norm": 3.5993716716766357, + "learning_rate": 7.499184498277151e-07, + "loss": 0.2846, + "num_input_tokens_seen": 18620008, + "step": 28530 + }, + { + "epoch": 16.8248820754717, + "grad_norm": 1.9620471000671387, + "learning_rate": 7.485638189573758e-07, + "loss": 0.2309, + "num_input_tokens_seen": 18622984, + "step": 28535 + }, + { + "epoch": 16.827830188679247, + "grad_norm": 3.4604578018188477, + "learning_rate": 7.472103136730891e-07, + "loss": 0.3069, + "num_input_tokens_seen": 18626024, + "step": 28540 + }, + { + "epoch": 16.83077830188679, + "grad_norm": 4.1467084884643555, + "learning_rate": 7.458579343331996e-07, + "loss": 0.4178, + "num_input_tokens_seen": 18629576, + "step": 28545 + }, + { + "epoch": 16.83372641509434, + "grad_norm": 4.977681636810303, + "learning_rate": 7.445066812957569e-07, + "loss": 0.26, + "num_input_tokens_seen": 18632744, + "step": 28550 + }, + { + "epoch": 16.836674528301888, + "grad_norm": 3.6723358631134033, + "learning_rate": 7.43156554918511e-07, + "loss": 0.3094, + "num_input_tokens_seen": 18635784, + "step": 28555 + }, + { + "epoch": 16.839622641509433, + "grad_norm": 5.035487651824951, + "learning_rate": 7.418075555589132e-07, + "loss": 0.3556, + "num_input_tokens_seen": 18639464, + "step": 28560 + }, + { + "epoch": 16.84257075471698, + "grad_norm": 4.815647602081299, + "learning_rate": 7.404596835741168e-07, + "loss": 0.4945, + "num_input_tokens_seen": 18642760, + "step": 28565 + }, + { + "epoch": 16.84551886792453, + "grad_norm": 12.751043319702148, + "learning_rate": 7.391129393209751e-07, + "loss": 0.3026, + "num_input_tokens_seen": 18646536, + "step": 28570 + }, + { + "epoch": 16.848466981132077, + "grad_norm": 4.579164981842041, + "learning_rate": 7.377673231560478e-07, + "loss": 0.2912, + "num_input_tokens_seen": 18649704, + "step": 28575 + }, + { + "epoch": 16.85141509433962, + "grad_norm": 4.935809135437012, + "learning_rate": 7.364228354355907e-07, + "loss": 0.3859, + "num_input_tokens_seen": 18652424, + "step": 28580 + }, + { + "epoch": 16.85436320754717, + "grad_norm": 4.620152473449707, + "learning_rate": 7.350794765155627e-07, + "loss": 0.3358, + "num_input_tokens_seen": 18655368, + "step": 28585 + }, + { + "epoch": 16.857311320754718, + "grad_norm": 8.465409278869629, + "learning_rate": 7.337372467516246e-07, + "loss": 0.3491, + "num_input_tokens_seen": 18658344, + "step": 28590 + }, + { + "epoch": 16.860259433962263, + "grad_norm": 2.5904970169067383, + "learning_rate": 7.323961464991369e-07, + "loss": 0.3281, + "num_input_tokens_seen": 18661448, + "step": 28595 + }, + { + "epoch": 16.86320754716981, + "grad_norm": 2.090069532394409, + "learning_rate": 7.310561761131601e-07, + "loss": 0.2802, + "num_input_tokens_seen": 18664648, + "step": 28600 + }, + { + "epoch": 16.86615566037736, + "grad_norm": 5.940135955810547, + "learning_rate": 7.297173359484605e-07, + "loss": 0.3634, + "num_input_tokens_seen": 18667400, + "step": 28605 + }, + { + "epoch": 16.869103773584907, + "grad_norm": 3.509368896484375, + "learning_rate": 7.283796263595e-07, + "loss": 0.2503, + "num_input_tokens_seen": 18670440, + "step": 28610 + }, + { + "epoch": 16.872051886792452, + "grad_norm": 4.251274585723877, + "learning_rate": 7.270430477004431e-07, + "loss": 0.522, + "num_input_tokens_seen": 18673032, + "step": 28615 + }, + { + "epoch": 16.875, + "grad_norm": 4.0522894859313965, + "learning_rate": 7.257076003251545e-07, + "loss": 0.447, + "num_input_tokens_seen": 18677192, + "step": 28620 + }, + { + "epoch": 16.877948113207548, + "grad_norm": 2.482048511505127, + "learning_rate": 7.243732845871998e-07, + "loss": 0.2254, + "num_input_tokens_seen": 18682280, + "step": 28625 + }, + { + "epoch": 16.880896226415093, + "grad_norm": 2.4756174087524414, + "learning_rate": 7.230401008398441e-07, + "loss": 0.3509, + "num_input_tokens_seen": 18685736, + "step": 28630 + }, + { + "epoch": 16.88384433962264, + "grad_norm": 2.6911680698394775, + "learning_rate": 7.217080494360546e-07, + "loss": 0.334, + "num_input_tokens_seen": 18689032, + "step": 28635 + }, + { + "epoch": 16.88679245283019, + "grad_norm": 3.798809051513672, + "learning_rate": 7.20377130728498e-07, + "loss": 0.3288, + "num_input_tokens_seen": 18692552, + "step": 28640 + }, + { + "epoch": 16.889740566037737, + "grad_norm": 4.084588050842285, + "learning_rate": 7.190473450695407e-07, + "loss": 0.28, + "num_input_tokens_seen": 18695912, + "step": 28645 + }, + { + "epoch": 16.892688679245282, + "grad_norm": 2.770679473876953, + "learning_rate": 7.177186928112484e-07, + "loss": 0.289, + "num_input_tokens_seen": 18698728, + "step": 28650 + }, + { + "epoch": 16.89563679245283, + "grad_norm": 3.297852039337158, + "learning_rate": 7.163911743053876e-07, + "loss": 0.3824, + "num_input_tokens_seen": 18701864, + "step": 28655 + }, + { + "epoch": 16.89858490566038, + "grad_norm": 5.296316146850586, + "learning_rate": 7.150647899034252e-07, + "loss": 0.2998, + "num_input_tokens_seen": 18704424, + "step": 28660 + }, + { + "epoch": 16.901533018867923, + "grad_norm": 4.755214214324951, + "learning_rate": 7.13739539956525e-07, + "loss": 0.3218, + "num_input_tokens_seen": 18707720, + "step": 28665 + }, + { + "epoch": 16.90448113207547, + "grad_norm": 4.040946006774902, + "learning_rate": 7.124154248155562e-07, + "loss": 0.3398, + "num_input_tokens_seen": 18713576, + "step": 28670 + }, + { + "epoch": 16.90742924528302, + "grad_norm": 3.970780849456787, + "learning_rate": 7.110924448310813e-07, + "loss": 0.3325, + "num_input_tokens_seen": 18716232, + "step": 28675 + }, + { + "epoch": 16.910377358490567, + "grad_norm": 2.820988893508911, + "learning_rate": 7.097706003533666e-07, + "loss": 0.2167, + "num_input_tokens_seen": 18719432, + "step": 28680 + }, + { + "epoch": 16.913325471698112, + "grad_norm": 2.992216110229492, + "learning_rate": 7.084498917323751e-07, + "loss": 0.3636, + "num_input_tokens_seen": 18722664, + "step": 28685 + }, + { + "epoch": 16.91627358490566, + "grad_norm": 3.8460693359375, + "learning_rate": 7.071303193177698e-07, + "loss": 0.3266, + "num_input_tokens_seen": 18725992, + "step": 28690 + }, + { + "epoch": 16.91922169811321, + "grad_norm": 2.9601423740386963, + "learning_rate": 7.058118834589133e-07, + "loss": 0.3559, + "num_input_tokens_seen": 18728520, + "step": 28695 + }, + { + "epoch": 16.922169811320753, + "grad_norm": 3.2146620750427246, + "learning_rate": 7.044945845048684e-07, + "loss": 0.3259, + "num_input_tokens_seen": 18732168, + "step": 28700 + }, + { + "epoch": 16.9251179245283, + "grad_norm": 4.605044364929199, + "learning_rate": 7.031784228043948e-07, + "loss": 0.2902, + "num_input_tokens_seen": 18736264, + "step": 28705 + }, + { + "epoch": 16.92806603773585, + "grad_norm": 2.1182103157043457, + "learning_rate": 7.01863398705952e-07, + "loss": 0.2599, + "num_input_tokens_seen": 18739144, + "step": 28710 + }, + { + "epoch": 16.931014150943398, + "grad_norm": 4.951053142547607, + "learning_rate": 7.005495125576983e-07, + "loss": 0.3527, + "num_input_tokens_seen": 18745096, + "step": 28715 + }, + { + "epoch": 16.933962264150942, + "grad_norm": 3.7235867977142334, + "learning_rate": 6.99236764707491e-07, + "loss": 0.2285, + "num_input_tokens_seen": 18747432, + "step": 28720 + }, + { + "epoch": 16.93691037735849, + "grad_norm": 4.073049545288086, + "learning_rate": 6.979251555028843e-07, + "loss": 0.2431, + "num_input_tokens_seen": 18751080, + "step": 28725 + }, + { + "epoch": 16.93985849056604, + "grad_norm": 4.604541301727295, + "learning_rate": 6.966146852911332e-07, + "loss": 0.2945, + "num_input_tokens_seen": 18753672, + "step": 28730 + }, + { + "epoch": 16.942806603773583, + "grad_norm": 3.618743419647217, + "learning_rate": 6.953053544191923e-07, + "loss": 0.2463, + "num_input_tokens_seen": 18757032, + "step": 28735 + }, + { + "epoch": 16.94575471698113, + "grad_norm": 3.5476527214050293, + "learning_rate": 6.939971632337111e-07, + "loss": 0.3649, + "num_input_tokens_seen": 18760744, + "step": 28740 + }, + { + "epoch": 16.94870283018868, + "grad_norm": 5.173311233520508, + "learning_rate": 6.926901120810387e-07, + "loss": 0.3169, + "num_input_tokens_seen": 18764072, + "step": 28745 + }, + { + "epoch": 16.951650943396228, + "grad_norm": 3.281554937362671, + "learning_rate": 6.91384201307222e-07, + "loss": 0.2927, + "num_input_tokens_seen": 18767496, + "step": 28750 + }, + { + "epoch": 16.954599056603772, + "grad_norm": 3.33485746383667, + "learning_rate": 6.900794312580078e-07, + "loss": 0.4488, + "num_input_tokens_seen": 18771240, + "step": 28755 + }, + { + "epoch": 16.95754716981132, + "grad_norm": 4.5220842361450195, + "learning_rate": 6.887758022788377e-07, + "loss": 0.2602, + "num_input_tokens_seen": 18774312, + "step": 28760 + }, + { + "epoch": 16.96049528301887, + "grad_norm": 2.8788657188415527, + "learning_rate": 6.874733147148549e-07, + "loss": 0.2412, + "num_input_tokens_seen": 18779272, + "step": 28765 + }, + { + "epoch": 16.963443396226417, + "grad_norm": 3.564028024673462, + "learning_rate": 6.861719689108987e-07, + "loss": 0.311, + "num_input_tokens_seen": 18782408, + "step": 28770 + }, + { + "epoch": 16.96639150943396, + "grad_norm": 3.9503748416900635, + "learning_rate": 6.84871765211505e-07, + "loss": 0.3793, + "num_input_tokens_seen": 18785096, + "step": 28775 + }, + { + "epoch": 16.96933962264151, + "grad_norm": 4.933022499084473, + "learning_rate": 6.835727039609086e-07, + "loss": 0.3615, + "num_input_tokens_seen": 18788680, + "step": 28780 + }, + { + "epoch": 16.972287735849058, + "grad_norm": 3.2679145336151123, + "learning_rate": 6.822747855030415e-07, + "loss": 0.38, + "num_input_tokens_seen": 18791592, + "step": 28785 + }, + { + "epoch": 16.975235849056602, + "grad_norm": 3.2902326583862305, + "learning_rate": 6.809780101815322e-07, + "loss": 0.4038, + "num_input_tokens_seen": 18794248, + "step": 28790 + }, + { + "epoch": 16.97818396226415, + "grad_norm": 3.654723644256592, + "learning_rate": 6.796823783397099e-07, + "loss": 0.2689, + "num_input_tokens_seen": 18797032, + "step": 28795 + }, + { + "epoch": 16.9811320754717, + "grad_norm": 4.483514308929443, + "learning_rate": 6.783878903205976e-07, + "loss": 0.3077, + "num_input_tokens_seen": 18800552, + "step": 28800 + }, + { + "epoch": 16.984080188679247, + "grad_norm": 7.218395233154297, + "learning_rate": 6.77094546466916e-07, + "loss": 0.3135, + "num_input_tokens_seen": 18803304, + "step": 28805 + }, + { + "epoch": 16.98702830188679, + "grad_norm": 4.23872709274292, + "learning_rate": 6.758023471210845e-07, + "loss": 0.2349, + "num_input_tokens_seen": 18806440, + "step": 28810 + }, + { + "epoch": 16.98997641509434, + "grad_norm": 3.07650089263916, + "learning_rate": 6.745112926252162e-07, + "loss": 0.3146, + "num_input_tokens_seen": 18809512, + "step": 28815 + }, + { + "epoch": 16.992924528301888, + "grad_norm": 2.9675638675689697, + "learning_rate": 6.732213833211265e-07, + "loss": 0.3026, + "num_input_tokens_seen": 18813544, + "step": 28820 + }, + { + "epoch": 16.995872641509433, + "grad_norm": 3.477440118789673, + "learning_rate": 6.719326195503218e-07, + "loss": 0.3043, + "num_input_tokens_seen": 18816200, + "step": 28825 + }, + { + "epoch": 16.99882075471698, + "grad_norm": 6.5422821044921875, + "learning_rate": 6.706450016540094e-07, + "loss": 0.3289, + "num_input_tokens_seen": 18818120, + "step": 28830 + }, + { + "epoch": 17.00176886792453, + "grad_norm": 5.277149677276611, + "learning_rate": 6.69358529973092e-07, + "loss": 0.4684, + "num_input_tokens_seen": 18820560, + "step": 28835 + }, + { + "epoch": 17.004716981132077, + "grad_norm": 2.618361711502075, + "learning_rate": 6.680732048481681e-07, + "loss": 0.2389, + "num_input_tokens_seen": 18823376, + "step": 28840 + }, + { + "epoch": 17.00766509433962, + "grad_norm": 6.985800266265869, + "learning_rate": 6.667890266195321e-07, + "loss": 0.2828, + "num_input_tokens_seen": 18826320, + "step": 28845 + }, + { + "epoch": 17.01061320754717, + "grad_norm": 3.6116225719451904, + "learning_rate": 6.655059956271759e-07, + "loss": 0.4073, + "num_input_tokens_seen": 18829392, + "step": 28850 + }, + { + "epoch": 17.013561320754718, + "grad_norm": 3.4441182613372803, + "learning_rate": 6.642241122107884e-07, + "loss": 0.2946, + "num_input_tokens_seen": 18832880, + "step": 28855 + }, + { + "epoch": 17.016509433962263, + "grad_norm": 5.60723352432251, + "learning_rate": 6.629433767097537e-07, + "loss": 0.4036, + "num_input_tokens_seen": 18835600, + "step": 28860 + }, + { + "epoch": 17.01945754716981, + "grad_norm": 5.501210689544678, + "learning_rate": 6.616637894631517e-07, + "loss": 0.2604, + "num_input_tokens_seen": 18838384, + "step": 28865 + }, + { + "epoch": 17.02240566037736, + "grad_norm": 2.9047465324401855, + "learning_rate": 6.603853508097591e-07, + "loss": 0.3213, + "num_input_tokens_seen": 18841360, + "step": 28870 + }, + { + "epoch": 17.025353773584907, + "grad_norm": 4.209240436553955, + "learning_rate": 6.591080610880468e-07, + "loss": 0.2987, + "num_input_tokens_seen": 18844464, + "step": 28875 + }, + { + "epoch": 17.028301886792452, + "grad_norm": 3.9276926517486572, + "learning_rate": 6.578319206361828e-07, + "loss": 0.2957, + "num_input_tokens_seen": 18847632, + "step": 28880 + }, + { + "epoch": 17.03125, + "grad_norm": 4.21993350982666, + "learning_rate": 6.565569297920327e-07, + "loss": 0.2527, + "num_input_tokens_seen": 18850832, + "step": 28885 + }, + { + "epoch": 17.034198113207548, + "grad_norm": 5.8264479637146, + "learning_rate": 6.552830888931544e-07, + "loss": 0.3308, + "num_input_tokens_seen": 18855088, + "step": 28890 + }, + { + "epoch": 17.037146226415093, + "grad_norm": 4.929085731506348, + "learning_rate": 6.540103982768031e-07, + "loss": 0.3567, + "num_input_tokens_seen": 18859024, + "step": 28895 + }, + { + "epoch": 17.04009433962264, + "grad_norm": 4.483932971954346, + "learning_rate": 6.527388582799293e-07, + "loss": 0.3168, + "num_input_tokens_seen": 18862768, + "step": 28900 + }, + { + "epoch": 17.04304245283019, + "grad_norm": 3.3115944862365723, + "learning_rate": 6.514684692391782e-07, + "loss": 0.336, + "num_input_tokens_seen": 18866448, + "step": 28905 + }, + { + "epoch": 17.045990566037737, + "grad_norm": 2.215034246444702, + "learning_rate": 6.501992314908895e-07, + "loss": 0.28, + "num_input_tokens_seen": 18869200, + "step": 28910 + }, + { + "epoch": 17.048938679245282, + "grad_norm": 4.784584999084473, + "learning_rate": 6.489311453711017e-07, + "loss": 0.2487, + "num_input_tokens_seen": 18871792, + "step": 28915 + }, + { + "epoch": 17.05188679245283, + "grad_norm": 3.3275740146636963, + "learning_rate": 6.476642112155457e-07, + "loss": 0.3703, + "num_input_tokens_seen": 18875824, + "step": 28920 + }, + { + "epoch": 17.05483490566038, + "grad_norm": 3.625434637069702, + "learning_rate": 6.463984293596476e-07, + "loss": 0.3144, + "num_input_tokens_seen": 18878384, + "step": 28925 + }, + { + "epoch": 17.057783018867923, + "grad_norm": 7.646711349487305, + "learning_rate": 6.451338001385282e-07, + "loss": 0.2724, + "num_input_tokens_seen": 18881584, + "step": 28930 + }, + { + "epoch": 17.06073113207547, + "grad_norm": 3.476174831390381, + "learning_rate": 6.438703238870037e-07, + "loss": 0.3482, + "num_input_tokens_seen": 18885808, + "step": 28935 + }, + { + "epoch": 17.06367924528302, + "grad_norm": 5.961153507232666, + "learning_rate": 6.426080009395846e-07, + "loss": 0.2442, + "num_input_tokens_seen": 18889680, + "step": 28940 + }, + { + "epoch": 17.066627358490567, + "grad_norm": 3.243980884552002, + "learning_rate": 6.413468316304755e-07, + "loss": 0.3201, + "num_input_tokens_seen": 18893200, + "step": 28945 + }, + { + "epoch": 17.069575471698112, + "grad_norm": 3.2043697834014893, + "learning_rate": 6.400868162935786e-07, + "loss": 0.298, + "num_input_tokens_seen": 18896496, + "step": 28950 + }, + { + "epoch": 17.07252358490566, + "grad_norm": 5.6146321296691895, + "learning_rate": 6.388279552624877e-07, + "loss": 0.3733, + "num_input_tokens_seen": 18900528, + "step": 28955 + }, + { + "epoch": 17.07547169811321, + "grad_norm": 3.801034927368164, + "learning_rate": 6.37570248870491e-07, + "loss": 0.3959, + "num_input_tokens_seen": 18903664, + "step": 28960 + }, + { + "epoch": 17.078419811320753, + "grad_norm": 2.3925743103027344, + "learning_rate": 6.363136974505718e-07, + "loss": 0.2677, + "num_input_tokens_seen": 18907216, + "step": 28965 + }, + { + "epoch": 17.0813679245283, + "grad_norm": 4.378587245941162, + "learning_rate": 6.350583013354078e-07, + "loss": 0.3265, + "num_input_tokens_seen": 18910992, + "step": 28970 + }, + { + "epoch": 17.08431603773585, + "grad_norm": 3.958256483078003, + "learning_rate": 6.338040608573693e-07, + "loss": 0.3975, + "num_input_tokens_seen": 18914736, + "step": 28975 + }, + { + "epoch": 17.087264150943398, + "grad_norm": 3.798232078552246, + "learning_rate": 6.325509763485238e-07, + "loss": 0.4356, + "num_input_tokens_seen": 18917680, + "step": 28980 + }, + { + "epoch": 17.090212264150942, + "grad_norm": 3.3746109008789062, + "learning_rate": 6.312990481406301e-07, + "loss": 0.422, + "num_input_tokens_seen": 18921104, + "step": 28985 + }, + { + "epoch": 17.09316037735849, + "grad_norm": 8.453032493591309, + "learning_rate": 6.300482765651411e-07, + "loss": 0.3194, + "num_input_tokens_seen": 18923632, + "step": 28990 + }, + { + "epoch": 17.09610849056604, + "grad_norm": 4.32177734375, + "learning_rate": 6.28798661953205e-07, + "loss": 0.3847, + "num_input_tokens_seen": 18927440, + "step": 28995 + }, + { + "epoch": 17.099056603773583, + "grad_norm": 2.9792327880859375, + "learning_rate": 6.275502046356618e-07, + "loss": 0.3914, + "num_input_tokens_seen": 18930896, + "step": 29000 + }, + { + "epoch": 17.10200471698113, + "grad_norm": 5.194839954376221, + "learning_rate": 6.263029049430447e-07, + "loss": 0.4015, + "num_input_tokens_seen": 18933680, + "step": 29005 + }, + { + "epoch": 17.10495283018868, + "grad_norm": 3.6355044841766357, + "learning_rate": 6.250567632055832e-07, + "loss": 0.2719, + "num_input_tokens_seen": 18937200, + "step": 29010 + }, + { + "epoch": 17.107900943396228, + "grad_norm": 3.861161470413208, + "learning_rate": 6.238117797532e-07, + "loss": 0.2425, + "num_input_tokens_seen": 18940976, + "step": 29015 + }, + { + "epoch": 17.110849056603772, + "grad_norm": 3.0211374759674072, + "learning_rate": 6.225679549155083e-07, + "loss": 0.2221, + "num_input_tokens_seen": 18943472, + "step": 29020 + }, + { + "epoch": 17.11379716981132, + "grad_norm": 2.980250358581543, + "learning_rate": 6.213252890218163e-07, + "loss": 0.4864, + "num_input_tokens_seen": 18946608, + "step": 29025 + }, + { + "epoch": 17.11674528301887, + "grad_norm": 4.8874592781066895, + "learning_rate": 6.200837824011247e-07, + "loss": 0.2447, + "num_input_tokens_seen": 18949232, + "step": 29030 + }, + { + "epoch": 17.119693396226417, + "grad_norm": 2.636491060256958, + "learning_rate": 6.188434353821282e-07, + "loss": 0.283, + "num_input_tokens_seen": 18953040, + "step": 29035 + }, + { + "epoch": 17.12264150943396, + "grad_norm": 3.269869327545166, + "learning_rate": 6.176042482932132e-07, + "loss": 0.3229, + "num_input_tokens_seen": 18956144, + "step": 29040 + }, + { + "epoch": 17.12558962264151, + "grad_norm": 3.9316980838775635, + "learning_rate": 6.163662214624616e-07, + "loss": 0.3138, + "num_input_tokens_seen": 18959312, + "step": 29045 + }, + { + "epoch": 17.128537735849058, + "grad_norm": 3.5140061378479004, + "learning_rate": 6.151293552176451e-07, + "loss": 0.29, + "num_input_tokens_seen": 18962384, + "step": 29050 + }, + { + "epoch": 17.131485849056602, + "grad_norm": 2.257054090499878, + "learning_rate": 6.138936498862291e-07, + "loss": 0.3089, + "num_input_tokens_seen": 18965936, + "step": 29055 + }, + { + "epoch": 17.13443396226415, + "grad_norm": 4.3367414474487305, + "learning_rate": 6.126591057953729e-07, + "loss": 0.3179, + "num_input_tokens_seen": 18969488, + "step": 29060 + }, + { + "epoch": 17.1373820754717, + "grad_norm": 7.230766296386719, + "learning_rate": 6.114257232719267e-07, + "loss": 0.2565, + "num_input_tokens_seen": 18971792, + "step": 29065 + }, + { + "epoch": 17.140330188679247, + "grad_norm": 2.9478418827056885, + "learning_rate": 6.101935026424332e-07, + "loss": 0.411, + "num_input_tokens_seen": 18975312, + "step": 29070 + }, + { + "epoch": 17.14327830188679, + "grad_norm": 3.457930088043213, + "learning_rate": 6.089624442331293e-07, + "loss": 0.3314, + "num_input_tokens_seen": 18977968, + "step": 29075 + }, + { + "epoch": 17.14622641509434, + "grad_norm": 4.382983207702637, + "learning_rate": 6.077325483699432e-07, + "loss": 0.296, + "num_input_tokens_seen": 18980432, + "step": 29080 + }, + { + "epoch": 17.149174528301888, + "grad_norm": 3.730106830596924, + "learning_rate": 6.065038153784947e-07, + "loss": 0.237, + "num_input_tokens_seen": 18983216, + "step": 29085 + }, + { + "epoch": 17.152122641509433, + "grad_norm": 3.6993281841278076, + "learning_rate": 6.052762455840955e-07, + "loss": 0.2683, + "num_input_tokens_seen": 18986672, + "step": 29090 + }, + { + "epoch": 17.15507075471698, + "grad_norm": 3.766350746154785, + "learning_rate": 6.040498393117494e-07, + "loss": 0.2812, + "num_input_tokens_seen": 18990000, + "step": 29095 + }, + { + "epoch": 17.15801886792453, + "grad_norm": 2.8253731727600098, + "learning_rate": 6.028245968861551e-07, + "loss": 0.2802, + "num_input_tokens_seen": 18994000, + "step": 29100 + }, + { + "epoch": 17.160966981132077, + "grad_norm": 4.212186336517334, + "learning_rate": 6.016005186316987e-07, + "loss": 0.3364, + "num_input_tokens_seen": 18997136, + "step": 29105 + }, + { + "epoch": 17.16391509433962, + "grad_norm": 2.9638659954071045, + "learning_rate": 6.003776048724614e-07, + "loss": 0.3032, + "num_input_tokens_seen": 19000208, + "step": 29110 + }, + { + "epoch": 17.16686320754717, + "grad_norm": 3.2039759159088135, + "learning_rate": 5.991558559322152e-07, + "loss": 0.246, + "num_input_tokens_seen": 19002736, + "step": 29115 + }, + { + "epoch": 17.169811320754718, + "grad_norm": 5.007462024688721, + "learning_rate": 5.979352721344223e-07, + "loss": 0.3158, + "num_input_tokens_seen": 19006480, + "step": 29120 + }, + { + "epoch": 17.172759433962263, + "grad_norm": 3.2318761348724365, + "learning_rate": 5.967158538022383e-07, + "loss": 0.2779, + "num_input_tokens_seen": 19010416, + "step": 29125 + }, + { + "epoch": 17.17570754716981, + "grad_norm": 6.452537536621094, + "learning_rate": 5.954976012585078e-07, + "loss": 0.2406, + "num_input_tokens_seen": 19013552, + "step": 29130 + }, + { + "epoch": 17.17865566037736, + "grad_norm": 2.605916976928711, + "learning_rate": 5.942805148257713e-07, + "loss": 0.2935, + "num_input_tokens_seen": 19017328, + "step": 29135 + }, + { + "epoch": 17.181603773584907, + "grad_norm": 3.168342113494873, + "learning_rate": 5.930645948262553e-07, + "loss": 0.4511, + "num_input_tokens_seen": 19020816, + "step": 29140 + }, + { + "epoch": 17.184551886792452, + "grad_norm": 6.5007405281066895, + "learning_rate": 5.918498415818813e-07, + "loss": 0.2451, + "num_input_tokens_seen": 19023408, + "step": 29145 + }, + { + "epoch": 17.1875, + "grad_norm": 3.4187076091766357, + "learning_rate": 5.906362554142592e-07, + "loss": 0.271, + "num_input_tokens_seen": 19026416, + "step": 29150 + }, + { + "epoch": 17.190448113207548, + "grad_norm": 4.388454437255859, + "learning_rate": 5.894238366446925e-07, + "loss": 0.2569, + "num_input_tokens_seen": 19029168, + "step": 29155 + }, + { + "epoch": 17.193396226415093, + "grad_norm": 2.6591086387634277, + "learning_rate": 5.882125855941723e-07, + "loss": 0.2518, + "num_input_tokens_seen": 19032912, + "step": 29160 + }, + { + "epoch": 17.19634433962264, + "grad_norm": 5.403634548187256, + "learning_rate": 5.870025025833842e-07, + "loss": 0.3503, + "num_input_tokens_seen": 19035472, + "step": 29165 + }, + { + "epoch": 17.19929245283019, + "grad_norm": 3.2672014236450195, + "learning_rate": 5.857935879327031e-07, + "loss": 0.2846, + "num_input_tokens_seen": 19039408, + "step": 29170 + }, + { + "epoch": 17.202240566037737, + "grad_norm": 3.047598123550415, + "learning_rate": 5.845858419621936e-07, + "loss": 0.298, + "num_input_tokens_seen": 19042384, + "step": 29175 + }, + { + "epoch": 17.205188679245282, + "grad_norm": 4.28211784362793, + "learning_rate": 5.83379264991612e-07, + "loss": 0.3781, + "num_input_tokens_seen": 19045648, + "step": 29180 + }, + { + "epoch": 17.20813679245283, + "grad_norm": 5.4828267097473145, + "learning_rate": 5.821738573404046e-07, + "loss": 0.4946, + "num_input_tokens_seen": 19049136, + "step": 29185 + }, + { + "epoch": 17.21108490566038, + "grad_norm": 3.1514363288879395, + "learning_rate": 5.80969619327707e-07, + "loss": 0.2932, + "num_input_tokens_seen": 19051728, + "step": 29190 + }, + { + "epoch": 17.214033018867923, + "grad_norm": 2.9034767150878906, + "learning_rate": 5.797665512723488e-07, + "loss": 0.2857, + "num_input_tokens_seen": 19055728, + "step": 29195 + }, + { + "epoch": 17.21698113207547, + "grad_norm": 6.199516773223877, + "learning_rate": 5.785646534928452e-07, + "loss": 0.3064, + "num_input_tokens_seen": 19058832, + "step": 29200 + }, + { + "epoch": 17.21992924528302, + "grad_norm": 3.9082412719726562, + "learning_rate": 5.77363926307406e-07, + "loss": 0.2681, + "num_input_tokens_seen": 19062608, + "step": 29205 + }, + { + "epoch": 17.222877358490567, + "grad_norm": 5.118161201477051, + "learning_rate": 5.761643700339281e-07, + "loss": 0.297, + "num_input_tokens_seen": 19066320, + "step": 29210 + }, + { + "epoch": 17.225825471698112, + "grad_norm": 3.933835744857788, + "learning_rate": 5.749659849899985e-07, + "loss": 0.3395, + "num_input_tokens_seen": 19069136, + "step": 29215 + }, + { + "epoch": 17.22877358490566, + "grad_norm": 3.4229798316955566, + "learning_rate": 5.737687714928953e-07, + "loss": 0.3699, + "num_input_tokens_seen": 19072496, + "step": 29220 + }, + { + "epoch": 17.23172169811321, + "grad_norm": 3.1838982105255127, + "learning_rate": 5.725727298595846e-07, + "loss": 0.3469, + "num_input_tokens_seen": 19075440, + "step": 29225 + }, + { + "epoch": 17.234669811320753, + "grad_norm": 3.5164546966552734, + "learning_rate": 5.71377860406726e-07, + "loss": 0.3471, + "num_input_tokens_seen": 19079728, + "step": 29230 + }, + { + "epoch": 17.2376179245283, + "grad_norm": 5.103067874908447, + "learning_rate": 5.701841634506655e-07, + "loss": 0.3067, + "num_input_tokens_seen": 19082640, + "step": 29235 + }, + { + "epoch": 17.24056603773585, + "grad_norm": 4.745849132537842, + "learning_rate": 5.689916393074391e-07, + "loss": 0.3328, + "num_input_tokens_seen": 19085456, + "step": 29240 + }, + { + "epoch": 17.243514150943398, + "grad_norm": 11.054007530212402, + "learning_rate": 5.678002882927725e-07, + "loss": 0.4058, + "num_input_tokens_seen": 19088304, + "step": 29245 + }, + { + "epoch": 17.246462264150942, + "grad_norm": 4.894809246063232, + "learning_rate": 5.666101107220811e-07, + "loss": 0.2134, + "num_input_tokens_seen": 19090672, + "step": 29250 + }, + { + "epoch": 17.24941037735849, + "grad_norm": 3.7762036323547363, + "learning_rate": 5.654211069104693e-07, + "loss": 0.2583, + "num_input_tokens_seen": 19094000, + "step": 29255 + }, + { + "epoch": 17.25235849056604, + "grad_norm": 3.5437495708465576, + "learning_rate": 5.642332771727321e-07, + "loss": 0.2633, + "num_input_tokens_seen": 19097040, + "step": 29260 + }, + { + "epoch": 17.255306603773583, + "grad_norm": 4.022183418273926, + "learning_rate": 5.630466218233521e-07, + "loss": 0.4488, + "num_input_tokens_seen": 19099984, + "step": 29265 + }, + { + "epoch": 17.25825471698113, + "grad_norm": 7.098259449005127, + "learning_rate": 5.618611411765007e-07, + "loss": 0.3237, + "num_input_tokens_seen": 19102576, + "step": 29270 + }, + { + "epoch": 17.26120283018868, + "grad_norm": 3.7024929523468018, + "learning_rate": 5.606768355460401e-07, + "loss": 0.2747, + "num_input_tokens_seen": 19105648, + "step": 29275 + }, + { + "epoch": 17.264150943396228, + "grad_norm": 4.162961959838867, + "learning_rate": 5.594937052455191e-07, + "loss": 0.2793, + "num_input_tokens_seen": 19108432, + "step": 29280 + }, + { + "epoch": 17.267099056603772, + "grad_norm": 8.350800514221191, + "learning_rate": 5.583117505881764e-07, + "loss": 0.2991, + "num_input_tokens_seen": 19112496, + "step": 29285 + }, + { + "epoch": 17.27004716981132, + "grad_norm": 3.2989964485168457, + "learning_rate": 5.571309718869417e-07, + "loss": 0.2687, + "num_input_tokens_seen": 19116528, + "step": 29290 + }, + { + "epoch": 17.27299528301887, + "grad_norm": 5.076319694519043, + "learning_rate": 5.559513694544282e-07, + "loss": 0.2841, + "num_input_tokens_seen": 19119664, + "step": 29295 + }, + { + "epoch": 17.275943396226417, + "grad_norm": 3.0870983600616455, + "learning_rate": 5.547729436029442e-07, + "loss": 0.3022, + "num_input_tokens_seen": 19122320, + "step": 29300 + }, + { + "epoch": 17.27889150943396, + "grad_norm": 3.8009376525878906, + "learning_rate": 5.535956946444809e-07, + "loss": 0.3635, + "num_input_tokens_seen": 19126096, + "step": 29305 + }, + { + "epoch": 17.28183962264151, + "grad_norm": 4.083861827850342, + "learning_rate": 5.524196228907203e-07, + "loss": 0.351, + "num_input_tokens_seen": 19130608, + "step": 29310 + }, + { + "epoch": 17.284787735849058, + "grad_norm": 4.0996575355529785, + "learning_rate": 5.512447286530326e-07, + "loss": 0.4131, + "num_input_tokens_seen": 19134832, + "step": 29315 + }, + { + "epoch": 17.287735849056602, + "grad_norm": 6.757421970367432, + "learning_rate": 5.500710122424746e-07, + "loss": 0.334, + "num_input_tokens_seen": 19138064, + "step": 29320 + }, + { + "epoch": 17.29068396226415, + "grad_norm": 9.738259315490723, + "learning_rate": 5.488984739697961e-07, + "loss": 0.2652, + "num_input_tokens_seen": 19140528, + "step": 29325 + }, + { + "epoch": 17.2936320754717, + "grad_norm": 4.88520622253418, + "learning_rate": 5.477271141454294e-07, + "loss": 0.3746, + "num_input_tokens_seen": 19143600, + "step": 29330 + }, + { + "epoch": 17.296580188679247, + "grad_norm": 2.0883452892303467, + "learning_rate": 5.465569330794974e-07, + "loss": 0.251, + "num_input_tokens_seen": 19146960, + "step": 29335 + }, + { + "epoch": 17.29952830188679, + "grad_norm": 3.2387006282806396, + "learning_rate": 5.453879310818105e-07, + "loss": 0.3181, + "num_input_tokens_seen": 19149488, + "step": 29340 + }, + { + "epoch": 17.30247641509434, + "grad_norm": 3.0843491554260254, + "learning_rate": 5.442201084618664e-07, + "loss": 0.2449, + "num_input_tokens_seen": 19152464, + "step": 29345 + }, + { + "epoch": 17.305424528301888, + "grad_norm": 3.8614251613616943, + "learning_rate": 5.430534655288528e-07, + "loss": 0.2897, + "num_input_tokens_seen": 19155856, + "step": 29350 + }, + { + "epoch": 17.308372641509433, + "grad_norm": 5.18664026260376, + "learning_rate": 5.418880025916428e-07, + "loss": 0.2634, + "num_input_tokens_seen": 19158704, + "step": 29355 + }, + { + "epoch": 17.31132075471698, + "grad_norm": 3.4826254844665527, + "learning_rate": 5.407237199587973e-07, + "loss": 0.2969, + "num_input_tokens_seen": 19162064, + "step": 29360 + }, + { + "epoch": 17.31426886792453, + "grad_norm": 3.203946590423584, + "learning_rate": 5.395606179385654e-07, + "loss": 0.4221, + "num_input_tokens_seen": 19165968, + "step": 29365 + }, + { + "epoch": 17.317216981132077, + "grad_norm": 3.0634963512420654, + "learning_rate": 5.383986968388833e-07, + "loss": 0.2409, + "num_input_tokens_seen": 19169424, + "step": 29370 + }, + { + "epoch": 17.32016509433962, + "grad_norm": 4.479260444641113, + "learning_rate": 5.372379569673736e-07, + "loss": 0.2774, + "num_input_tokens_seen": 19172176, + "step": 29375 + }, + { + "epoch": 17.32311320754717, + "grad_norm": 4.5797224044799805, + "learning_rate": 5.360783986313495e-07, + "loss": 0.2571, + "num_input_tokens_seen": 19175216, + "step": 29380 + }, + { + "epoch": 17.326061320754718, + "grad_norm": 7.678421974182129, + "learning_rate": 5.349200221378076e-07, + "loss": 0.3781, + "num_input_tokens_seen": 19177744, + "step": 29385 + }, + { + "epoch": 17.329009433962263, + "grad_norm": 3.3629536628723145, + "learning_rate": 5.33762827793432e-07, + "loss": 0.2195, + "num_input_tokens_seen": 19181552, + "step": 29390 + }, + { + "epoch": 17.33195754716981, + "grad_norm": 4.92515754699707, + "learning_rate": 5.326068159045978e-07, + "loss": 0.3001, + "num_input_tokens_seen": 19184272, + "step": 29395 + }, + { + "epoch": 17.33490566037736, + "grad_norm": 3.416231632232666, + "learning_rate": 5.314519867773621e-07, + "loss": 0.2516, + "num_input_tokens_seen": 19186736, + "step": 29400 + }, + { + "epoch": 17.337853773584907, + "grad_norm": 4.624024391174316, + "learning_rate": 5.302983407174711e-07, + "loss": 0.2914, + "num_input_tokens_seen": 19190032, + "step": 29405 + }, + { + "epoch": 17.340801886792452, + "grad_norm": 5.584841728210449, + "learning_rate": 5.291458780303572e-07, + "loss": 0.3048, + "num_input_tokens_seen": 19192560, + "step": 29410 + }, + { + "epoch": 17.34375, + "grad_norm": 4.910147666931152, + "learning_rate": 5.279945990211411e-07, + "loss": 0.2656, + "num_input_tokens_seen": 19195664, + "step": 29415 + }, + { + "epoch": 17.346698113207548, + "grad_norm": 5.374019622802734, + "learning_rate": 5.26844503994628e-07, + "loss": 0.2001, + "num_input_tokens_seen": 19198512, + "step": 29420 + }, + { + "epoch": 17.349646226415093, + "grad_norm": 3.2921152114868164, + "learning_rate": 5.25695593255311e-07, + "loss": 0.2826, + "num_input_tokens_seen": 19201296, + "step": 29425 + }, + { + "epoch": 17.35259433962264, + "grad_norm": 2.220937490463257, + "learning_rate": 5.24547867107369e-07, + "loss": 0.3581, + "num_input_tokens_seen": 19204496, + "step": 29430 + }, + { + "epoch": 17.35554245283019, + "grad_norm": 3.2003278732299805, + "learning_rate": 5.234013258546672e-07, + "loss": 0.352, + "num_input_tokens_seen": 19207280, + "step": 29435 + }, + { + "epoch": 17.358490566037737, + "grad_norm": 3.3672468662261963, + "learning_rate": 5.222559698007563e-07, + "loss": 0.3048, + "num_input_tokens_seen": 19210640, + "step": 29440 + }, + { + "epoch": 17.361438679245282, + "grad_norm": 4.007109642028809, + "learning_rate": 5.211117992488763e-07, + "loss": 0.3254, + "num_input_tokens_seen": 19213680, + "step": 29445 + }, + { + "epoch": 17.36438679245283, + "grad_norm": 8.020753860473633, + "learning_rate": 5.199688145019505e-07, + "loss": 0.3811, + "num_input_tokens_seen": 19216336, + "step": 29450 + }, + { + "epoch": 17.36733490566038, + "grad_norm": 2.816948413848877, + "learning_rate": 5.188270158625891e-07, + "loss": 0.2949, + "num_input_tokens_seen": 19219600, + "step": 29455 + }, + { + "epoch": 17.370283018867923, + "grad_norm": 6.062239170074463, + "learning_rate": 5.176864036330875e-07, + "loss": 0.4705, + "num_input_tokens_seen": 19222000, + "step": 29460 + }, + { + "epoch": 17.37323113207547, + "grad_norm": 3.4085443019866943, + "learning_rate": 5.165469781154287e-07, + "loss": 0.2213, + "num_input_tokens_seen": 19226320, + "step": 29465 + }, + { + "epoch": 17.37617924528302, + "grad_norm": 3.9273881912231445, + "learning_rate": 5.154087396112789e-07, + "loss": 0.2652, + "num_input_tokens_seen": 19230448, + "step": 29470 + }, + { + "epoch": 17.379127358490567, + "grad_norm": 2.818270683288574, + "learning_rate": 5.142716884219939e-07, + "loss": 0.2501, + "num_input_tokens_seen": 19233392, + "step": 29475 + }, + { + "epoch": 17.382075471698112, + "grad_norm": 7.058661460876465, + "learning_rate": 5.131358248486118e-07, + "loss": 0.2874, + "num_input_tokens_seen": 19236432, + "step": 29480 + }, + { + "epoch": 17.38502358490566, + "grad_norm": 4.268390655517578, + "learning_rate": 5.120011491918564e-07, + "loss": 0.2896, + "num_input_tokens_seen": 19240496, + "step": 29485 + }, + { + "epoch": 17.38797169811321, + "grad_norm": 4.086554527282715, + "learning_rate": 5.108676617521402e-07, + "loss": 0.338, + "num_input_tokens_seen": 19242960, + "step": 29490 + }, + { + "epoch": 17.390919811320753, + "grad_norm": 2.0774359703063965, + "learning_rate": 5.097353628295571e-07, + "loss": 0.3103, + "num_input_tokens_seen": 19246032, + "step": 29495 + }, + { + "epoch": 17.3938679245283, + "grad_norm": 6.527927398681641, + "learning_rate": 5.086042527238893e-07, + "loss": 0.3332, + "num_input_tokens_seen": 19249488, + "step": 29500 + }, + { + "epoch": 17.39681603773585, + "grad_norm": 6.069390773773193, + "learning_rate": 5.074743317346009e-07, + "loss": 0.3523, + "num_input_tokens_seen": 19252784, + "step": 29505 + }, + { + "epoch": 17.399764150943398, + "grad_norm": 3.542381525039673, + "learning_rate": 5.063456001608458e-07, + "loss": 0.5155, + "num_input_tokens_seen": 19256080, + "step": 29510 + }, + { + "epoch": 17.402712264150942, + "grad_norm": 4.585089683532715, + "learning_rate": 5.052180583014599e-07, + "loss": 0.2626, + "num_input_tokens_seen": 19258736, + "step": 29515 + }, + { + "epoch": 17.40566037735849, + "grad_norm": 3.1628377437591553, + "learning_rate": 5.04091706454965e-07, + "loss": 0.2374, + "num_input_tokens_seen": 19261072, + "step": 29520 + }, + { + "epoch": 17.40860849056604, + "grad_norm": 4.301898956298828, + "learning_rate": 5.029665449195665e-07, + "loss": 0.326, + "num_input_tokens_seen": 19263632, + "step": 29525 + }, + { + "epoch": 17.411556603773583, + "grad_norm": 1.9644662141799927, + "learning_rate": 5.018425739931559e-07, + "loss": 0.2177, + "num_input_tokens_seen": 19266320, + "step": 29530 + }, + { + "epoch": 17.41450471698113, + "grad_norm": 5.495102405548096, + "learning_rate": 5.007197939733099e-07, + "loss": 0.2709, + "num_input_tokens_seen": 19270320, + "step": 29535 + }, + { + "epoch": 17.41745283018868, + "grad_norm": 3.2260141372680664, + "learning_rate": 4.995982051572895e-07, + "loss": 0.2668, + "num_input_tokens_seen": 19274032, + "step": 29540 + }, + { + "epoch": 17.420400943396228, + "grad_norm": 4.1187872886657715, + "learning_rate": 4.984778078420405e-07, + "loss": 0.2502, + "num_input_tokens_seen": 19277520, + "step": 29545 + }, + { + "epoch": 17.423349056603772, + "grad_norm": 3.1905806064605713, + "learning_rate": 4.973586023241917e-07, + "loss": 0.3827, + "num_input_tokens_seen": 19280336, + "step": 29550 + }, + { + "epoch": 17.42629716981132, + "grad_norm": 7.913694381713867, + "learning_rate": 4.962405889000588e-07, + "loss": 0.3391, + "num_input_tokens_seen": 19283472, + "step": 29555 + }, + { + "epoch": 17.42924528301887, + "grad_norm": 3.9036004543304443, + "learning_rate": 4.951237678656396e-07, + "loss": 0.3236, + "num_input_tokens_seen": 19287024, + "step": 29560 + }, + { + "epoch": 17.432193396226417, + "grad_norm": 3.0127153396606445, + "learning_rate": 4.940081395166174e-07, + "loss": 0.3463, + "num_input_tokens_seen": 19290544, + "step": 29565 + }, + { + "epoch": 17.43514150943396, + "grad_norm": 3.3962459564208984, + "learning_rate": 4.928937041483606e-07, + "loss": 0.3387, + "num_input_tokens_seen": 19295184, + "step": 29570 + }, + { + "epoch": 17.43808962264151, + "grad_norm": 1.5757339000701904, + "learning_rate": 4.917804620559202e-07, + "loss": 0.1971, + "num_input_tokens_seen": 19298160, + "step": 29575 + }, + { + "epoch": 17.441037735849058, + "grad_norm": 4.236644744873047, + "learning_rate": 4.906684135340317e-07, + "loss": 0.4083, + "num_input_tokens_seen": 19300976, + "step": 29580 + }, + { + "epoch": 17.443985849056602, + "grad_norm": 3.140695810317993, + "learning_rate": 4.89557558877114e-07, + "loss": 0.457, + "num_input_tokens_seen": 19304048, + "step": 29585 + }, + { + "epoch": 17.44693396226415, + "grad_norm": 4.475461959838867, + "learning_rate": 4.884478983792728e-07, + "loss": 0.2452, + "num_input_tokens_seen": 19307248, + "step": 29590 + }, + { + "epoch": 17.4498820754717, + "grad_norm": 3.385934591293335, + "learning_rate": 4.873394323342939e-07, + "loss": 0.2956, + "num_input_tokens_seen": 19310608, + "step": 29595 + }, + { + "epoch": 17.452830188679247, + "grad_norm": 4.398652076721191, + "learning_rate": 4.86232161035648e-07, + "loss": 0.2721, + "num_input_tokens_seen": 19313584, + "step": 29600 + }, + { + "epoch": 17.45577830188679, + "grad_norm": 3.378872871398926, + "learning_rate": 4.851260847764916e-07, + "loss": 0.301, + "num_input_tokens_seen": 19317968, + "step": 29605 + }, + { + "epoch": 17.45872641509434, + "grad_norm": 2.934174060821533, + "learning_rate": 4.840212038496622e-07, + "loss": 0.317, + "num_input_tokens_seen": 19321104, + "step": 29610 + }, + { + "epoch": 17.461674528301888, + "grad_norm": 4.517685890197754, + "learning_rate": 4.82917518547682e-07, + "loss": 0.331, + "num_input_tokens_seen": 19324656, + "step": 29615 + }, + { + "epoch": 17.464622641509433, + "grad_norm": 3.247096061706543, + "learning_rate": 4.81815029162756e-07, + "loss": 0.3208, + "num_input_tokens_seen": 19327920, + "step": 29620 + }, + { + "epoch": 17.46757075471698, + "grad_norm": 4.3389787673950195, + "learning_rate": 4.807137359867725e-07, + "loss": 0.291, + "num_input_tokens_seen": 19330800, + "step": 29625 + }, + { + "epoch": 17.47051886792453, + "grad_norm": 3.516417980194092, + "learning_rate": 4.79613639311306e-07, + "loss": 0.2731, + "num_input_tokens_seen": 19334320, + "step": 29630 + }, + { + "epoch": 17.473466981132077, + "grad_norm": 3.692194938659668, + "learning_rate": 4.785147394276096e-07, + "loss": 0.2686, + "num_input_tokens_seen": 19337648, + "step": 29635 + }, + { + "epoch": 17.47641509433962, + "grad_norm": 3.881232261657715, + "learning_rate": 4.774170366266223e-07, + "loss": 0.3165, + "num_input_tokens_seen": 19340976, + "step": 29640 + }, + { + "epoch": 17.47936320754717, + "grad_norm": 4.988696098327637, + "learning_rate": 4.763205311989666e-07, + "loss": 0.4306, + "num_input_tokens_seen": 19344080, + "step": 29645 + }, + { + "epoch": 17.482311320754718, + "grad_norm": 2.917496681213379, + "learning_rate": 4.752252234349458e-07, + "loss": 0.4194, + "num_input_tokens_seen": 19348816, + "step": 29650 + }, + { + "epoch": 17.485259433962263, + "grad_norm": 2.9181694984436035, + "learning_rate": 4.7413111362454634e-07, + "loss": 0.2636, + "num_input_tokens_seen": 19351824, + "step": 29655 + }, + { + "epoch": 17.48820754716981, + "grad_norm": 2.9263641834259033, + "learning_rate": 4.7303820205744143e-07, + "loss": 0.311, + "num_input_tokens_seen": 19356368, + "step": 29660 + }, + { + "epoch": 17.49115566037736, + "grad_norm": 5.925682067871094, + "learning_rate": 4.7194648902298303e-07, + "loss": 0.3681, + "num_input_tokens_seen": 19359536, + "step": 29665 + }, + { + "epoch": 17.494103773584907, + "grad_norm": 3.42815899848938, + "learning_rate": 4.7085597481020594e-07, + "loss": 0.2564, + "num_input_tokens_seen": 19362832, + "step": 29670 + }, + { + "epoch": 17.497051886792452, + "grad_norm": 3.28584623336792, + "learning_rate": 4.697666597078293e-07, + "loss": 0.4519, + "num_input_tokens_seen": 19365264, + "step": 29675 + }, + { + "epoch": 17.5, + "grad_norm": 3.416677236557007, + "learning_rate": 4.6867854400425237e-07, + "loss": 0.276, + "num_input_tokens_seen": 19368048, + "step": 29680 + }, + { + "epoch": 17.502948113207548, + "grad_norm": 3.6103389263153076, + "learning_rate": 4.6759162798756084e-07, + "loss": 0.3494, + "num_input_tokens_seen": 19371248, + "step": 29685 + }, + { + "epoch": 17.505896226415093, + "grad_norm": 3.540623188018799, + "learning_rate": 4.6650591194551895e-07, + "loss": 0.3489, + "num_input_tokens_seen": 19373904, + "step": 29690 + }, + { + "epoch": 17.50884433962264, + "grad_norm": 4.782562732696533, + "learning_rate": 4.654213961655757e-07, + "loss": 0.2702, + "num_input_tokens_seen": 19377168, + "step": 29695 + }, + { + "epoch": 17.51179245283019, + "grad_norm": 4.321146011352539, + "learning_rate": 4.6433808093486075e-07, + "loss": 0.2467, + "num_input_tokens_seen": 19380560, + "step": 29700 + }, + { + "epoch": 17.514740566037737, + "grad_norm": 3.540022850036621, + "learning_rate": 4.63255966540187e-07, + "loss": 0.3421, + "num_input_tokens_seen": 19383728, + "step": 29705 + }, + { + "epoch": 17.517688679245282, + "grad_norm": 4.388162136077881, + "learning_rate": 4.62175053268048e-07, + "loss": 0.3978, + "num_input_tokens_seen": 19387248, + "step": 29710 + }, + { + "epoch": 17.52063679245283, + "grad_norm": 4.140903472900391, + "learning_rate": 4.6109534140462045e-07, + "loss": 0.3003, + "num_input_tokens_seen": 19389968, + "step": 29715 + }, + { + "epoch": 17.52358490566038, + "grad_norm": 2.79817271232605, + "learning_rate": 4.6001683123576226e-07, + "loss": 0.2801, + "num_input_tokens_seen": 19393232, + "step": 29720 + }, + { + "epoch": 17.526533018867923, + "grad_norm": 7.1996917724609375, + "learning_rate": 4.589395230470145e-07, + "loss": 0.2981, + "num_input_tokens_seen": 19396016, + "step": 29725 + }, + { + "epoch": 17.52948113207547, + "grad_norm": 3.436506748199463, + "learning_rate": 4.578634171235996e-07, + "loss": 0.272, + "num_input_tokens_seen": 19399248, + "step": 29730 + }, + { + "epoch": 17.53242924528302, + "grad_norm": 5.760735034942627, + "learning_rate": 4.567885137504202e-07, + "loss": 0.4159, + "num_input_tokens_seen": 19402544, + "step": 29735 + }, + { + "epoch": 17.535377358490567, + "grad_norm": 5.711773872375488, + "learning_rate": 4.55714813212062e-07, + "loss": 0.2357, + "num_input_tokens_seen": 19405264, + "step": 29740 + }, + { + "epoch": 17.538325471698112, + "grad_norm": 4.368917465209961, + "learning_rate": 4.5464231579279206e-07, + "loss": 0.319, + "num_input_tokens_seen": 19408720, + "step": 29745 + }, + { + "epoch": 17.54127358490566, + "grad_norm": 1.6674206256866455, + "learning_rate": 4.535710217765571e-07, + "loss": 0.264, + "num_input_tokens_seen": 19412304, + "step": 29750 + }, + { + "epoch": 17.54422169811321, + "grad_norm": 3.3602254390716553, + "learning_rate": 4.5250093144698913e-07, + "loss": 0.2706, + "num_input_tokens_seen": 19415216, + "step": 29755 + }, + { + "epoch": 17.547169811320753, + "grad_norm": 4.324750900268555, + "learning_rate": 4.514320450873988e-07, + "loss": 0.2453, + "num_input_tokens_seen": 19419600, + "step": 29760 + }, + { + "epoch": 17.5501179245283, + "grad_norm": 6.401323318481445, + "learning_rate": 4.503643629807769e-07, + "loss": 0.2474, + "num_input_tokens_seen": 19421872, + "step": 29765 + }, + { + "epoch": 17.55306603773585, + "grad_norm": 2.8764748573303223, + "learning_rate": 4.4929788540979844e-07, + "loss": 0.3223, + "num_input_tokens_seen": 19425136, + "step": 29770 + }, + { + "epoch": 17.556014150943398, + "grad_norm": 8.648366928100586, + "learning_rate": 4.4823261265681596e-07, + "loss": 0.3028, + "num_input_tokens_seen": 19430288, + "step": 29775 + }, + { + "epoch": 17.558962264150942, + "grad_norm": 2.7787599563598633, + "learning_rate": 4.471685450038671e-07, + "loss": 0.3121, + "num_input_tokens_seen": 19433392, + "step": 29780 + }, + { + "epoch": 17.56191037735849, + "grad_norm": 3.8126487731933594, + "learning_rate": 4.4610568273266706e-07, + "loss": 0.2639, + "num_input_tokens_seen": 19436752, + "step": 29785 + }, + { + "epoch": 17.56485849056604, + "grad_norm": 2.481271982192993, + "learning_rate": 4.450440261246142e-07, + "loss": 0.291, + "num_input_tokens_seen": 19440112, + "step": 29790 + }, + { + "epoch": 17.567806603773583, + "grad_norm": 7.814016342163086, + "learning_rate": 4.439835754607863e-07, + "loss": 0.2637, + "num_input_tokens_seen": 19443088, + "step": 29795 + }, + { + "epoch": 17.57075471698113, + "grad_norm": 2.29601788520813, + "learning_rate": 4.429243310219422e-07, + "loss": 0.3194, + "num_input_tokens_seen": 19447056, + "step": 29800 + }, + { + "epoch": 17.57370283018868, + "grad_norm": 3.193683385848999, + "learning_rate": 4.418662930885215e-07, + "loss": 0.3207, + "num_input_tokens_seen": 19450640, + "step": 29805 + }, + { + "epoch": 17.576650943396228, + "grad_norm": 4.817287921905518, + "learning_rate": 4.408094619406439e-07, + "loss": 0.2822, + "num_input_tokens_seen": 19453840, + "step": 29810 + }, + { + "epoch": 17.579599056603772, + "grad_norm": 4.155791282653809, + "learning_rate": 4.3975383785810954e-07, + "loss": 0.2301, + "num_input_tokens_seen": 19457424, + "step": 29815 + }, + { + "epoch": 17.58254716981132, + "grad_norm": 3.9995663166046143, + "learning_rate": 4.3869942112040096e-07, + "loss": 0.3121, + "num_input_tokens_seen": 19463376, + "step": 29820 + }, + { + "epoch": 17.58549528301887, + "grad_norm": 3.8244709968566895, + "learning_rate": 4.3764621200667936e-07, + "loss": 0.3803, + "num_input_tokens_seen": 19466736, + "step": 29825 + }, + { + "epoch": 17.588443396226417, + "grad_norm": 4.089743614196777, + "learning_rate": 4.365942107957849e-07, + "loss": 0.2282, + "num_input_tokens_seen": 19469168, + "step": 29830 + }, + { + "epoch": 17.59139150943396, + "grad_norm": 8.447809219360352, + "learning_rate": 4.35543417766241e-07, + "loss": 0.2505, + "num_input_tokens_seen": 19472688, + "step": 29835 + }, + { + "epoch": 17.59433962264151, + "grad_norm": 7.290267467498779, + "learning_rate": 4.3449383319624785e-07, + "loss": 0.3478, + "num_input_tokens_seen": 19476016, + "step": 29840 + }, + { + "epoch": 17.597287735849058, + "grad_norm": 3.0142695903778076, + "learning_rate": 4.3344545736368926e-07, + "loss": 0.3952, + "num_input_tokens_seen": 19479248, + "step": 29845 + }, + { + "epoch": 17.600235849056602, + "grad_norm": 3.7160372734069824, + "learning_rate": 4.323982905461266e-07, + "loss": 0.3353, + "num_input_tokens_seen": 19483152, + "step": 29850 + }, + { + "epoch": 17.60318396226415, + "grad_norm": 3.0339345932006836, + "learning_rate": 4.313523330208019e-07, + "loss": 0.3586, + "num_input_tokens_seen": 19486864, + "step": 29855 + }, + { + "epoch": 17.6061320754717, + "grad_norm": 3.8709583282470703, + "learning_rate": 4.303075850646371e-07, + "loss": 0.2687, + "num_input_tokens_seen": 19489680, + "step": 29860 + }, + { + "epoch": 17.609080188679247, + "grad_norm": 4.574602127075195, + "learning_rate": 4.2926404695423305e-07, + "loss": 0.3526, + "num_input_tokens_seen": 19492080, + "step": 29865 + }, + { + "epoch": 17.61202830188679, + "grad_norm": 3.695984125137329, + "learning_rate": 4.282217189658705e-07, + "loss": 0.313, + "num_input_tokens_seen": 19495184, + "step": 29870 + }, + { + "epoch": 17.61497641509434, + "grad_norm": 4.408347129821777, + "learning_rate": 4.27180601375512e-07, + "loss": 0.354, + "num_input_tokens_seen": 19498160, + "step": 29875 + }, + { + "epoch": 17.617924528301888, + "grad_norm": 3.8960680961608887, + "learning_rate": 4.2614069445879646e-07, + "loss": 0.3712, + "num_input_tokens_seen": 19501968, + "step": 29880 + }, + { + "epoch": 17.620872641509433, + "grad_norm": 4.106062889099121, + "learning_rate": 4.251019984910448e-07, + "loss": 0.4004, + "num_input_tokens_seen": 19505072, + "step": 29885 + }, + { + "epoch": 17.62382075471698, + "grad_norm": 5.152200698852539, + "learning_rate": 4.2406451374725597e-07, + "loss": 0.3436, + "num_input_tokens_seen": 19508016, + "step": 29890 + }, + { + "epoch": 17.62676886792453, + "grad_norm": 2.682178020477295, + "learning_rate": 4.2302824050210855e-07, + "loss": 0.2946, + "num_input_tokens_seen": 19510640, + "step": 29895 + }, + { + "epoch": 17.629716981132077, + "grad_norm": 3.4632980823516846, + "learning_rate": 4.2199317902995974e-07, + "loss": 0.2977, + "num_input_tokens_seen": 19515120, + "step": 29900 + }, + { + "epoch": 17.63266509433962, + "grad_norm": 5.079971790313721, + "learning_rate": 4.209593296048459e-07, + "loss": 0.2225, + "num_input_tokens_seen": 19517616, + "step": 29905 + }, + { + "epoch": 17.63561320754717, + "grad_norm": 3.9023067951202393, + "learning_rate": 4.1992669250048524e-07, + "loss": 0.3369, + "num_input_tokens_seen": 19520976, + "step": 29910 + }, + { + "epoch": 17.638561320754718, + "grad_norm": 4.456829071044922, + "learning_rate": 4.188952679902719e-07, + "loss": 0.4101, + "num_input_tokens_seen": 19523984, + "step": 29915 + }, + { + "epoch": 17.641509433962263, + "grad_norm": 3.4006335735321045, + "learning_rate": 4.178650563472797e-07, + "loss": 0.3525, + "num_input_tokens_seen": 19527600, + "step": 29920 + }, + { + "epoch": 17.64445754716981, + "grad_norm": 2.8123559951782227, + "learning_rate": 4.168360578442615e-07, + "loss": 0.1958, + "num_input_tokens_seen": 19531088, + "step": 29925 + }, + { + "epoch": 17.64740566037736, + "grad_norm": 4.697080135345459, + "learning_rate": 4.1580827275365e-07, + "loss": 0.4181, + "num_input_tokens_seen": 19534096, + "step": 29930 + }, + { + "epoch": 17.650353773584907, + "grad_norm": 5.454930782318115, + "learning_rate": 4.147817013475536e-07, + "loss": 0.2538, + "num_input_tokens_seen": 19537104, + "step": 29935 + }, + { + "epoch": 17.653301886792452, + "grad_norm": 3.5876967906951904, + "learning_rate": 4.1375634389776375e-07, + "loss": 0.364, + "num_input_tokens_seen": 19539792, + "step": 29940 + }, + { + "epoch": 17.65625, + "grad_norm": 3.8499343395233154, + "learning_rate": 4.127322006757478e-07, + "loss": 0.3605, + "num_input_tokens_seen": 19542736, + "step": 29945 + }, + { + "epoch": 17.659198113207548, + "grad_norm": 2.883835554122925, + "learning_rate": 4.1170927195265163e-07, + "loss": 0.3084, + "num_input_tokens_seen": 19545808, + "step": 29950 + }, + { + "epoch": 17.662146226415093, + "grad_norm": 3.778754949569702, + "learning_rate": 4.1068755799930026e-07, + "loss": 0.3281, + "num_input_tokens_seen": 19548560, + "step": 29955 + }, + { + "epoch": 17.66509433962264, + "grad_norm": 6.076083660125732, + "learning_rate": 4.096670590861962e-07, + "loss": 0.2985, + "num_input_tokens_seen": 19551152, + "step": 29960 + }, + { + "epoch": 17.66804245283019, + "grad_norm": 3.78519344329834, + "learning_rate": 4.086477754835211e-07, + "loss": 0.2938, + "num_input_tokens_seen": 19554096, + "step": 29965 + }, + { + "epoch": 17.670990566037737, + "grad_norm": 3.2133800983428955, + "learning_rate": 4.0762970746113517e-07, + "loss": 0.2838, + "num_input_tokens_seen": 19557712, + "step": 29970 + }, + { + "epoch": 17.673938679245282, + "grad_norm": 3.8399910926818848, + "learning_rate": 4.0661285528857676e-07, + "loss": 0.3082, + "num_input_tokens_seen": 19560752, + "step": 29975 + }, + { + "epoch": 17.67688679245283, + "grad_norm": 3.958176612854004, + "learning_rate": 4.0559721923506155e-07, + "loss": 0.269, + "num_input_tokens_seen": 19563952, + "step": 29980 + }, + { + "epoch": 17.67983490566038, + "grad_norm": 5.1226325035095215, + "learning_rate": 4.045827995694834e-07, + "loss": 0.3325, + "num_input_tokens_seen": 19566832, + "step": 29985 + }, + { + "epoch": 17.682783018867923, + "grad_norm": 4.479787826538086, + "learning_rate": 4.035695965604142e-07, + "loss": 0.3582, + "num_input_tokens_seen": 19569840, + "step": 29990 + }, + { + "epoch": 17.68573113207547, + "grad_norm": 2.767713785171509, + "learning_rate": 4.0255761047610365e-07, + "loss": 0.2413, + "num_input_tokens_seen": 19572944, + "step": 29995 + }, + { + "epoch": 17.68867924528302, + "grad_norm": 6.384271144866943, + "learning_rate": 4.0154684158447864e-07, + "loss": 0.3416, + "num_input_tokens_seen": 19576880, + "step": 30000 + }, + { + "epoch": 17.691627358490567, + "grad_norm": 5.059001922607422, + "learning_rate": 4.0053729015314623e-07, + "loss": 0.2505, + "num_input_tokens_seen": 19580240, + "step": 30005 + }, + { + "epoch": 17.694575471698112, + "grad_norm": 5.936522483825684, + "learning_rate": 3.9952895644938926e-07, + "loss": 0.3053, + "num_input_tokens_seen": 19583280, + "step": 30010 + }, + { + "epoch": 17.69752358490566, + "grad_norm": 4.144394874572754, + "learning_rate": 3.985218407401681e-07, + "loss": 0.3301, + "num_input_tokens_seen": 19586736, + "step": 30015 + }, + { + "epoch": 17.70047169811321, + "grad_norm": 3.082507848739624, + "learning_rate": 3.975159432921205e-07, + "loss": 0.2569, + "num_input_tokens_seen": 19590544, + "step": 30020 + }, + { + "epoch": 17.703419811320753, + "grad_norm": 3.03660249710083, + "learning_rate": 3.9651126437156294e-07, + "loss": 0.2829, + "num_input_tokens_seen": 19594160, + "step": 30025 + }, + { + "epoch": 17.7063679245283, + "grad_norm": 5.3290581703186035, + "learning_rate": 3.9550780424448653e-07, + "loss": 0.2393, + "num_input_tokens_seen": 19597296, + "step": 30030 + }, + { + "epoch": 17.70931603773585, + "grad_norm": 2.8234241008758545, + "learning_rate": 3.9450556317656487e-07, + "loss": 0.3034, + "num_input_tokens_seen": 19600976, + "step": 30035 + }, + { + "epoch": 17.712264150943398, + "grad_norm": 5.621461391448975, + "learning_rate": 3.935045414331434e-07, + "loss": 0.3863, + "num_input_tokens_seen": 19605008, + "step": 30040 + }, + { + "epoch": 17.715212264150942, + "grad_norm": 2.3685083389282227, + "learning_rate": 3.925047392792475e-07, + "loss": 0.3222, + "num_input_tokens_seen": 19608944, + "step": 30045 + }, + { + "epoch": 17.71816037735849, + "grad_norm": 7.218699932098389, + "learning_rate": 3.9150615697957917e-07, + "loss": 0.2717, + "num_input_tokens_seen": 19612016, + "step": 30050 + }, + { + "epoch": 17.72110849056604, + "grad_norm": 3.70358943939209, + "learning_rate": 3.9050879479851753e-07, + "loss": 0.3504, + "num_input_tokens_seen": 19615632, + "step": 30055 + }, + { + "epoch": 17.724056603773583, + "grad_norm": 6.316326141357422, + "learning_rate": 3.89512653000117e-07, + "loss": 0.3382, + "num_input_tokens_seen": 19618160, + "step": 30060 + }, + { + "epoch": 17.72700471698113, + "grad_norm": 3.5059216022491455, + "learning_rate": 3.8851773184811203e-07, + "loss": 0.3233, + "num_input_tokens_seen": 19620784, + "step": 30065 + }, + { + "epoch": 17.72995283018868, + "grad_norm": 6.4526543617248535, + "learning_rate": 3.8752403160591255e-07, + "loss": 0.2703, + "num_input_tokens_seen": 19623408, + "step": 30070 + }, + { + "epoch": 17.732900943396228, + "grad_norm": 2.48557448387146, + "learning_rate": 3.8653155253660477e-07, + "loss": 0.3463, + "num_input_tokens_seen": 19626832, + "step": 30075 + }, + { + "epoch": 17.735849056603772, + "grad_norm": 3.0013647079467773, + "learning_rate": 3.8554029490295073e-07, + "loss": 0.2681, + "num_input_tokens_seen": 19630224, + "step": 30080 + }, + { + "epoch": 17.73879716981132, + "grad_norm": 2.65704345703125, + "learning_rate": 3.8455025896739164e-07, + "loss": 0.2114, + "num_input_tokens_seen": 19632656, + "step": 30085 + }, + { + "epoch": 17.74174528301887, + "grad_norm": 4.569005012512207, + "learning_rate": 3.8356144499204215e-07, + "loss": 0.3769, + "num_input_tokens_seen": 19636400, + "step": 30090 + }, + { + "epoch": 17.744693396226417, + "grad_norm": 3.1655654907226562, + "learning_rate": 3.8257385323869576e-07, + "loss": 0.3341, + "num_input_tokens_seen": 19639440, + "step": 30095 + }, + { + "epoch": 17.74764150943396, + "grad_norm": 2.605433940887451, + "learning_rate": 3.815874839688222e-07, + "loss": 0.2892, + "num_input_tokens_seen": 19643536, + "step": 30100 + }, + { + "epoch": 17.75058962264151, + "grad_norm": 4.5944390296936035, + "learning_rate": 3.8060233744356634e-07, + "loss": 0.2481, + "num_input_tokens_seen": 19646224, + "step": 30105 + }, + { + "epoch": 17.753537735849058, + "grad_norm": 3.501011371612549, + "learning_rate": 3.796184139237502e-07, + "loss": 0.2878, + "num_input_tokens_seen": 19648848, + "step": 30110 + }, + { + "epoch": 17.756485849056602, + "grad_norm": 3.2684073448181152, + "learning_rate": 3.7863571366987206e-07, + "loss": 0.2424, + "num_input_tokens_seen": 19652560, + "step": 30115 + }, + { + "epoch": 17.75943396226415, + "grad_norm": 3.3470709323883057, + "learning_rate": 3.776542369421049e-07, + "loss": 0.2921, + "num_input_tokens_seen": 19655440, + "step": 30120 + }, + { + "epoch": 17.7623820754717, + "grad_norm": 6.413597583770752, + "learning_rate": 3.766739840003003e-07, + "loss": 0.2471, + "num_input_tokens_seen": 19657936, + "step": 30125 + }, + { + "epoch": 17.765330188679247, + "grad_norm": 3.7638256549835205, + "learning_rate": 3.756949551039835e-07, + "loss": 0.3031, + "num_input_tokens_seen": 19662544, + "step": 30130 + }, + { + "epoch": 17.76827830188679, + "grad_norm": 4.620702743530273, + "learning_rate": 3.7471715051235757e-07, + "loss": 0.2436, + "num_input_tokens_seen": 19666416, + "step": 30135 + }, + { + "epoch": 17.77122641509434, + "grad_norm": 5.6916913986206055, + "learning_rate": 3.7374057048429947e-07, + "loss": 0.3651, + "num_input_tokens_seen": 19669040, + "step": 30140 + }, + { + "epoch": 17.774174528301888, + "grad_norm": 3.1047825813293457, + "learning_rate": 3.7276521527836396e-07, + "loss": 0.4039, + "num_input_tokens_seen": 19673648, + "step": 30145 + }, + { + "epoch": 17.777122641509433, + "grad_norm": 7.431186676025391, + "learning_rate": 3.717910851527784e-07, + "loss": 0.2903, + "num_input_tokens_seen": 19676528, + "step": 30150 + }, + { + "epoch": 17.78007075471698, + "grad_norm": 5.141274452209473, + "learning_rate": 3.708181803654498e-07, + "loss": 0.244, + "num_input_tokens_seen": 19679696, + "step": 30155 + }, + { + "epoch": 17.78301886792453, + "grad_norm": 3.8741374015808105, + "learning_rate": 3.6984650117395993e-07, + "loss": 0.2425, + "num_input_tokens_seen": 19682512, + "step": 30160 + }, + { + "epoch": 17.785966981132077, + "grad_norm": 1.933734655380249, + "learning_rate": 3.688760478355635e-07, + "loss": 0.2804, + "num_input_tokens_seen": 19685552, + "step": 30165 + }, + { + "epoch": 17.78891509433962, + "grad_norm": 5.843873977661133, + "learning_rate": 3.679068206071923e-07, + "loss": 0.3399, + "num_input_tokens_seen": 19688048, + "step": 30170 + }, + { + "epoch": 17.79186320754717, + "grad_norm": 4.484386920928955, + "learning_rate": 3.669388197454532e-07, + "loss": 0.2963, + "num_input_tokens_seen": 19690960, + "step": 30175 + }, + { + "epoch": 17.794811320754718, + "grad_norm": 3.630345582962036, + "learning_rate": 3.6597204550662956e-07, + "loss": 0.2896, + "num_input_tokens_seen": 19693936, + "step": 30180 + }, + { + "epoch": 17.797759433962263, + "grad_norm": 4.386622428894043, + "learning_rate": 3.650064981466772e-07, + "loss": 0.29, + "num_input_tokens_seen": 19697072, + "step": 30185 + }, + { + "epoch": 17.80070754716981, + "grad_norm": 3.656722068786621, + "learning_rate": 3.640421779212311e-07, + "loss": 0.2891, + "num_input_tokens_seen": 19699696, + "step": 30190 + }, + { + "epoch": 17.80365566037736, + "grad_norm": 5.5976972579956055, + "learning_rate": 3.630790850855986e-07, + "loss": 0.323, + "num_input_tokens_seen": 19702928, + "step": 30195 + }, + { + "epoch": 17.806603773584907, + "grad_norm": 4.749702453613281, + "learning_rate": 3.62117219894762e-07, + "loss": 0.2993, + "num_input_tokens_seen": 19706064, + "step": 30200 + }, + { + "epoch": 17.809551886792452, + "grad_norm": 7.9056782722473145, + "learning_rate": 3.611565826033797e-07, + "loss": 0.4681, + "num_input_tokens_seen": 19709520, + "step": 30205 + }, + { + "epoch": 17.8125, + "grad_norm": 4.5009002685546875, + "learning_rate": 3.6019717346578445e-07, + "loss": 0.3719, + "num_input_tokens_seen": 19716112, + "step": 30210 + }, + { + "epoch": 17.815448113207548, + "grad_norm": 8.460150718688965, + "learning_rate": 3.5923899273598293e-07, + "loss": 0.3702, + "num_input_tokens_seen": 19719056, + "step": 30215 + }, + { + "epoch": 17.818396226415093, + "grad_norm": 3.733607530593872, + "learning_rate": 3.582820406676596e-07, + "loss": 0.2669, + "num_input_tokens_seen": 19721456, + "step": 30220 + }, + { + "epoch": 17.82134433962264, + "grad_norm": 6.31386137008667, + "learning_rate": 3.5732631751417056e-07, + "loss": 0.3417, + "num_input_tokens_seen": 19724432, + "step": 30225 + }, + { + "epoch": 17.82429245283019, + "grad_norm": 3.8544058799743652, + "learning_rate": 3.563718235285485e-07, + "loss": 0.356, + "num_input_tokens_seen": 19728496, + "step": 30230 + }, + { + "epoch": 17.827240566037737, + "grad_norm": 5.316858768463135, + "learning_rate": 3.5541855896349844e-07, + "loss": 0.2916, + "num_input_tokens_seen": 19731120, + "step": 30235 + }, + { + "epoch": 17.830188679245282, + "grad_norm": 4.305591583251953, + "learning_rate": 3.544665240714018e-07, + "loss": 0.2883, + "num_input_tokens_seen": 19734192, + "step": 30240 + }, + { + "epoch": 17.83313679245283, + "grad_norm": 5.590890884399414, + "learning_rate": 3.535157191043137e-07, + "loss": 0.4068, + "num_input_tokens_seen": 19737104, + "step": 30245 + }, + { + "epoch": 17.83608490566038, + "grad_norm": 5.575997352600098, + "learning_rate": 3.5256614431396385e-07, + "loss": 0.3138, + "num_input_tokens_seen": 19742480, + "step": 30250 + }, + { + "epoch": 17.839033018867923, + "grad_norm": 4.585902690887451, + "learning_rate": 3.516177999517578e-07, + "loss": 0.2504, + "num_input_tokens_seen": 19745040, + "step": 30255 + }, + { + "epoch": 17.84198113207547, + "grad_norm": 2.8222391605377197, + "learning_rate": 3.50670686268772e-07, + "loss": 0.2715, + "num_input_tokens_seen": 19747664, + "step": 30260 + }, + { + "epoch": 17.84492924528302, + "grad_norm": 5.196811199188232, + "learning_rate": 3.497248035157602e-07, + "loss": 0.3268, + "num_input_tokens_seen": 19751184, + "step": 30265 + }, + { + "epoch": 17.847877358490567, + "grad_norm": 4.497313022613525, + "learning_rate": 3.4878015194314773e-07, + "loss": 0.4222, + "num_input_tokens_seen": 19754800, + "step": 30270 + }, + { + "epoch": 17.850825471698112, + "grad_norm": 3.775904655456543, + "learning_rate": 3.4783673180103617e-07, + "loss": 0.3148, + "num_input_tokens_seen": 19757904, + "step": 30275 + }, + { + "epoch": 17.85377358490566, + "grad_norm": 2.69215989112854, + "learning_rate": 3.468945433391985e-07, + "loss": 0.5146, + "num_input_tokens_seen": 19761392, + "step": 30280 + }, + { + "epoch": 17.85672169811321, + "grad_norm": 5.309360980987549, + "learning_rate": 3.459535868070851e-07, + "loss": 0.2667, + "num_input_tokens_seen": 19763568, + "step": 30285 + }, + { + "epoch": 17.859669811320753, + "grad_norm": 7.470308303833008, + "learning_rate": 3.450138624538174e-07, + "loss": 0.3641, + "num_input_tokens_seen": 19766704, + "step": 30290 + }, + { + "epoch": 17.8626179245283, + "grad_norm": 3.211538553237915, + "learning_rate": 3.440753705281913e-07, + "loss": 0.2482, + "num_input_tokens_seen": 19769904, + "step": 30295 + }, + { + "epoch": 17.86556603773585, + "grad_norm": 5.239882469177246, + "learning_rate": 3.4313811127867693e-07, + "loss": 0.3732, + "num_input_tokens_seen": 19772720, + "step": 30300 + }, + { + "epoch": 17.868514150943398, + "grad_norm": 3.390812397003174, + "learning_rate": 3.4220208495341745e-07, + "loss": 0.3046, + "num_input_tokens_seen": 19776144, + "step": 30305 + }, + { + "epoch": 17.871462264150942, + "grad_norm": 2.789263963699341, + "learning_rate": 3.412672918002291e-07, + "loss": 0.4016, + "num_input_tokens_seen": 19779824, + "step": 30310 + }, + { + "epoch": 17.87441037735849, + "grad_norm": 3.523062229156494, + "learning_rate": 3.403337320666045e-07, + "loss": 0.236, + "num_input_tokens_seen": 19782832, + "step": 30315 + }, + { + "epoch": 17.87735849056604, + "grad_norm": 3.138392925262451, + "learning_rate": 3.394014059997064e-07, + "loss": 0.2286, + "num_input_tokens_seen": 19786192, + "step": 30320 + }, + { + "epoch": 17.880306603773583, + "grad_norm": 4.906579494476318, + "learning_rate": 3.3847031384637185e-07, + "loss": 0.3561, + "num_input_tokens_seen": 19790736, + "step": 30325 + }, + { + "epoch": 17.88325471698113, + "grad_norm": 3.9168195724487305, + "learning_rate": 3.3754045585311147e-07, + "loss": 0.3541, + "num_input_tokens_seen": 19793584, + "step": 30330 + }, + { + "epoch": 17.88620283018868, + "grad_norm": 2.9901039600372314, + "learning_rate": 3.366118322661094e-07, + "loss": 0.2574, + "num_input_tokens_seen": 19798832, + "step": 30335 + }, + { + "epoch": 17.889150943396228, + "grad_norm": 3.9961998462677, + "learning_rate": 3.3568444333122283e-07, + "loss": 0.3633, + "num_input_tokens_seen": 19801936, + "step": 30340 + }, + { + "epoch": 17.892099056603772, + "grad_norm": 4.012005805969238, + "learning_rate": 3.347582892939816e-07, + "loss": 0.3435, + "num_input_tokens_seen": 19805648, + "step": 30345 + }, + { + "epoch": 17.89504716981132, + "grad_norm": 7.200450897216797, + "learning_rate": 3.338333703995905e-07, + "loss": 0.4421, + "num_input_tokens_seen": 19808240, + "step": 30350 + }, + { + "epoch": 17.89799528301887, + "grad_norm": 4.785140514373779, + "learning_rate": 3.329096868929238e-07, + "loss": 0.3599, + "num_input_tokens_seen": 19811184, + "step": 30355 + }, + { + "epoch": 17.900943396226417, + "grad_norm": 5.302978515625, + "learning_rate": 3.319872390185325e-07, + "loss": 0.2981, + "num_input_tokens_seen": 19814064, + "step": 30360 + }, + { + "epoch": 17.90389150943396, + "grad_norm": 6.354378700256348, + "learning_rate": 3.3106602702063727e-07, + "loss": 0.3339, + "num_input_tokens_seen": 19816976, + "step": 30365 + }, + { + "epoch": 17.90683962264151, + "grad_norm": 2.7377803325653076, + "learning_rate": 3.3014605114313316e-07, + "loss": 0.2165, + "num_input_tokens_seen": 19820240, + "step": 30370 + }, + { + "epoch": 17.909787735849058, + "grad_norm": 2.375612735748291, + "learning_rate": 3.2922731162958744e-07, + "loss": 0.2161, + "num_input_tokens_seen": 19822608, + "step": 30375 + }, + { + "epoch": 17.912735849056602, + "grad_norm": 3.5113744735717773, + "learning_rate": 3.2830980872324114e-07, + "loss": 0.289, + "num_input_tokens_seen": 19825744, + "step": 30380 + }, + { + "epoch": 17.91568396226415, + "grad_norm": 1.9386707544326782, + "learning_rate": 3.2739354266700775e-07, + "loss": 0.1966, + "num_input_tokens_seen": 19829264, + "step": 30385 + }, + { + "epoch": 17.9186320754717, + "grad_norm": 4.5147247314453125, + "learning_rate": 3.264785137034709e-07, + "loss": 0.2513, + "num_input_tokens_seen": 19831728, + "step": 30390 + }, + { + "epoch": 17.921580188679247, + "grad_norm": 3.4204776287078857, + "learning_rate": 3.2556472207488977e-07, + "loss": 0.2735, + "num_input_tokens_seen": 19835664, + "step": 30395 + }, + { + "epoch": 17.92452830188679, + "grad_norm": 2.9941046237945557, + "learning_rate": 3.246521680231934e-07, + "loss": 0.3221, + "num_input_tokens_seen": 19838672, + "step": 30400 + }, + { + "epoch": 17.92747641509434, + "grad_norm": 3.2318358421325684, + "learning_rate": 3.2374085178998594e-07, + "loss": 0.3461, + "num_input_tokens_seen": 19841808, + "step": 30405 + }, + { + "epoch": 17.930424528301888, + "grad_norm": 3.9075708389282227, + "learning_rate": 3.2283077361654145e-07, + "loss": 0.272, + "num_input_tokens_seen": 19844528, + "step": 30410 + }, + { + "epoch": 17.933372641509433, + "grad_norm": 3.9072682857513428, + "learning_rate": 3.2192193374380677e-07, + "loss": 0.2614, + "num_input_tokens_seen": 19849360, + "step": 30415 + }, + { + "epoch": 17.93632075471698, + "grad_norm": 4.664844989776611, + "learning_rate": 3.210143324124021e-07, + "loss": 0.2683, + "num_input_tokens_seen": 19856400, + "step": 30420 + }, + { + "epoch": 17.93926886792453, + "grad_norm": 4.720576763153076, + "learning_rate": 3.2010796986261805e-07, + "loss": 0.4036, + "num_input_tokens_seen": 19860848, + "step": 30425 + }, + { + "epoch": 17.942216981132077, + "grad_norm": 4.2414398193359375, + "learning_rate": 3.1920284633441713e-07, + "loss": 0.2543, + "num_input_tokens_seen": 19863600, + "step": 30430 + }, + { + "epoch": 17.94516509433962, + "grad_norm": 4.348722457885742, + "learning_rate": 3.1829896206743704e-07, + "loss": 0.2435, + "num_input_tokens_seen": 19866320, + "step": 30435 + }, + { + "epoch": 17.94811320754717, + "grad_norm": 3.7772605419158936, + "learning_rate": 3.173963173009825e-07, + "loss": 0.2821, + "num_input_tokens_seen": 19869040, + "step": 30440 + }, + { + "epoch": 17.951061320754718, + "grad_norm": 4.74680233001709, + "learning_rate": 3.164949122740352e-07, + "loss": 0.3873, + "num_input_tokens_seen": 19871920, + "step": 30445 + }, + { + "epoch": 17.954009433962263, + "grad_norm": 4.48148250579834, + "learning_rate": 3.1559474722524406e-07, + "loss": 0.3104, + "num_input_tokens_seen": 19874704, + "step": 30450 + }, + { + "epoch": 17.95695754716981, + "grad_norm": 3.523751974105835, + "learning_rate": 3.146958223929325e-07, + "loss": 0.2786, + "num_input_tokens_seen": 19877520, + "step": 30455 + }, + { + "epoch": 17.95990566037736, + "grad_norm": 3.61141300201416, + "learning_rate": 3.1379813801509454e-07, + "loss": 0.452, + "num_input_tokens_seen": 19881840, + "step": 30460 + }, + { + "epoch": 17.962853773584907, + "grad_norm": 2.4816577434539795, + "learning_rate": 3.1290169432939556e-07, + "loss": 0.2294, + "num_input_tokens_seen": 19885712, + "step": 30465 + }, + { + "epoch": 17.965801886792452, + "grad_norm": 6.697802543640137, + "learning_rate": 3.120064915731735e-07, + "loss": 0.3337, + "num_input_tokens_seen": 19889328, + "step": 30470 + }, + { + "epoch": 17.96875, + "grad_norm": 2.0659384727478027, + "learning_rate": 3.1111252998343723e-07, + "loss": 0.2596, + "num_input_tokens_seen": 19892944, + "step": 30475 + }, + { + "epoch": 17.971698113207548, + "grad_norm": 4.429078102111816, + "learning_rate": 3.102198097968662e-07, + "loss": 0.3538, + "num_input_tokens_seen": 19896144, + "step": 30480 + }, + { + "epoch": 17.974646226415093, + "grad_norm": 4.173407077789307, + "learning_rate": 3.093283312498124e-07, + "loss": 0.2861, + "num_input_tokens_seen": 19899056, + "step": 30485 + }, + { + "epoch": 17.97759433962264, + "grad_norm": 3.0966267585754395, + "learning_rate": 3.084380945782989e-07, + "loss": 0.3174, + "num_input_tokens_seen": 19902064, + "step": 30490 + }, + { + "epoch": 17.98054245283019, + "grad_norm": 3.1534929275512695, + "learning_rate": 3.0754910001801866e-07, + "loss": 0.2133, + "num_input_tokens_seen": 19905616, + "step": 30495 + }, + { + "epoch": 17.983490566037737, + "grad_norm": 5.631502628326416, + "learning_rate": 3.0666134780433786e-07, + "loss": 0.2998, + "num_input_tokens_seen": 19908336, + "step": 30500 + }, + { + "epoch": 17.986438679245282, + "grad_norm": 7.465195655822754, + "learning_rate": 3.0577483817229306e-07, + "loss": 0.3673, + "num_input_tokens_seen": 19911152, + "step": 30505 + }, + { + "epoch": 17.98938679245283, + "grad_norm": 4.607506275177002, + "learning_rate": 3.0488957135659023e-07, + "loss": 0.3242, + "num_input_tokens_seen": 19914640, + "step": 30510 + }, + { + "epoch": 17.99233490566038, + "grad_norm": 3.0459094047546387, + "learning_rate": 3.040055475916087e-07, + "loss": 0.2564, + "num_input_tokens_seen": 19917936, + "step": 30515 + }, + { + "epoch": 17.995283018867923, + "grad_norm": 3.427809953689575, + "learning_rate": 3.0312276711139675e-07, + "loss": 0.328, + "num_input_tokens_seen": 19921776, + "step": 30520 + }, + { + "epoch": 17.99823113207547, + "grad_norm": 2.3180325031280518, + "learning_rate": 3.0224123014967353e-07, + "loss": 0.3527, + "num_input_tokens_seen": 19924848, + "step": 30525 + }, + { + "epoch": 18.0, + "eval_loss": 0.6273570656776428, + "eval_runtime": 19.3797, + "eval_samples_per_second": 87.514, + "eval_steps_per_second": 21.879, + "num_input_tokens_seen": 19926608, + "step": 30528 + }, + { + "epoch": 18.00117924528302, + "grad_norm": 5.795991897583008, + "learning_rate": 3.013609369398324e-07, + "loss": 0.2776, + "num_input_tokens_seen": 19927888, + "step": 30530 + }, + { + "epoch": 18.004127358490567, + "grad_norm": 3.457123041152954, + "learning_rate": 3.004818877149318e-07, + "loss": 0.2073, + "num_input_tokens_seen": 19930608, + "step": 30535 + }, + { + "epoch": 18.007075471698112, + "grad_norm": 2.05129075050354, + "learning_rate": 2.9960408270770624e-07, + "loss": 0.3069, + "num_input_tokens_seen": 19933872, + "step": 30540 + }, + { + "epoch": 18.01002358490566, + "grad_norm": 4.635818004608154, + "learning_rate": 2.9872752215055755e-07, + "loss": 0.3925, + "num_input_tokens_seen": 19936944, + "step": 30545 + }, + { + "epoch": 18.01297169811321, + "grad_norm": 5.408751010894775, + "learning_rate": 2.9785220627555844e-07, + "loss": 0.3283, + "num_input_tokens_seen": 19939536, + "step": 30550 + }, + { + "epoch": 18.015919811320753, + "grad_norm": 4.959076404571533, + "learning_rate": 2.9697813531445295e-07, + "loss": 0.3485, + "num_input_tokens_seen": 19942512, + "step": 30555 + }, + { + "epoch": 18.0188679245283, + "grad_norm": 4.398247241973877, + "learning_rate": 2.9610530949865433e-07, + "loss": 0.2613, + "num_input_tokens_seen": 19945840, + "step": 30560 + }, + { + "epoch": 18.02181603773585, + "grad_norm": 3.203413486480713, + "learning_rate": 2.952337290592483e-07, + "loss": 0.3541, + "num_input_tokens_seen": 19948688, + "step": 30565 + }, + { + "epoch": 18.024764150943398, + "grad_norm": 3.4848127365112305, + "learning_rate": 2.9436339422698913e-07, + "loss": 0.2613, + "num_input_tokens_seen": 19952464, + "step": 30570 + }, + { + "epoch": 18.027712264150942, + "grad_norm": 3.749217987060547, + "learning_rate": 2.934943052323008e-07, + "loss": 0.359, + "num_input_tokens_seen": 19956112, + "step": 30575 + }, + { + "epoch": 18.03066037735849, + "grad_norm": 4.3835768699646, + "learning_rate": 2.926264623052799e-07, + "loss": 0.2984, + "num_input_tokens_seen": 19959472, + "step": 30580 + }, + { + "epoch": 18.03360849056604, + "grad_norm": 4.386301040649414, + "learning_rate": 2.9175986567569036e-07, + "loss": 0.2322, + "num_input_tokens_seen": 19962224, + "step": 30585 + }, + { + "epoch": 18.036556603773583, + "grad_norm": 7.671792507171631, + "learning_rate": 2.9089451557296755e-07, + "loss": 0.259, + "num_input_tokens_seen": 19965008, + "step": 30590 + }, + { + "epoch": 18.03950471698113, + "grad_norm": 3.2223095893859863, + "learning_rate": 2.9003041222621706e-07, + "loss": 0.4284, + "num_input_tokens_seen": 19968624, + "step": 30595 + }, + { + "epoch": 18.04245283018868, + "grad_norm": 3.1935136318206787, + "learning_rate": 2.8916755586421375e-07, + "loss": 0.3152, + "num_input_tokens_seen": 19971728, + "step": 30600 + }, + { + "epoch": 18.045400943396228, + "grad_norm": 2.774658441543579, + "learning_rate": 2.883059467154031e-07, + "loss": 0.2578, + "num_input_tokens_seen": 19974768, + "step": 30605 + }, + { + "epoch": 18.048349056603772, + "grad_norm": 3.2750120162963867, + "learning_rate": 2.8744558500789887e-07, + "loss": 0.3173, + "num_input_tokens_seen": 19978384, + "step": 30610 + }, + { + "epoch": 18.05129716981132, + "grad_norm": 5.315641403198242, + "learning_rate": 2.8658647096948546e-07, + "loss": 0.4031, + "num_input_tokens_seen": 19981872, + "step": 30615 + }, + { + "epoch": 18.05424528301887, + "grad_norm": 7.541603088378906, + "learning_rate": 2.8572860482761813e-07, + "loss": 0.2739, + "num_input_tokens_seen": 19985232, + "step": 30620 + }, + { + "epoch": 18.057193396226417, + "grad_norm": 3.4834625720977783, + "learning_rate": 2.8487198680942017e-07, + "loss": 0.5049, + "num_input_tokens_seen": 19987824, + "step": 30625 + }, + { + "epoch": 18.06014150943396, + "grad_norm": 2.477468490600586, + "learning_rate": 2.840166171416836e-07, + "loss": 0.2201, + "num_input_tokens_seen": 19991184, + "step": 30630 + }, + { + "epoch": 18.06308962264151, + "grad_norm": 3.4069912433624268, + "learning_rate": 2.8316249605087386e-07, + "loss": 0.2935, + "num_input_tokens_seen": 19994672, + "step": 30635 + }, + { + "epoch": 18.066037735849058, + "grad_norm": 4.839682579040527, + "learning_rate": 2.823096237631212e-07, + "loss": 0.2354, + "num_input_tokens_seen": 19997744, + "step": 30640 + }, + { + "epoch": 18.068985849056602, + "grad_norm": 3.7567875385284424, + "learning_rate": 2.814580005042283e-07, + "loss": 0.2568, + "num_input_tokens_seen": 20001904, + "step": 30645 + }, + { + "epoch": 18.07193396226415, + "grad_norm": 3.836216449737549, + "learning_rate": 2.8060762649966435e-07, + "loss": 0.2637, + "num_input_tokens_seen": 20004624, + "step": 30650 + }, + { + "epoch": 18.0748820754717, + "grad_norm": 5.0879082679748535, + "learning_rate": 2.797585019745713e-07, + "loss": 0.3086, + "num_input_tokens_seen": 20007824, + "step": 30655 + }, + { + "epoch": 18.077830188679247, + "grad_norm": 3.0640363693237305, + "learning_rate": 2.789106271537584e-07, + "loss": 0.2621, + "num_input_tokens_seen": 20010896, + "step": 30660 + }, + { + "epoch": 18.08077830188679, + "grad_norm": 3.1409871578216553, + "learning_rate": 2.780640022617037e-07, + "loss": 0.1847, + "num_input_tokens_seen": 20014160, + "step": 30665 + }, + { + "epoch": 18.08372641509434, + "grad_norm": 2.538684844970703, + "learning_rate": 2.772186275225547e-07, + "loss": 0.2586, + "num_input_tokens_seen": 20017968, + "step": 30670 + }, + { + "epoch": 18.086674528301888, + "grad_norm": 3.700963258743286, + "learning_rate": 2.7637450316012836e-07, + "loss": 0.2762, + "num_input_tokens_seen": 20020752, + "step": 30675 + }, + { + "epoch": 18.089622641509433, + "grad_norm": 5.596766948699951, + "learning_rate": 2.755316293979088e-07, + "loss": 0.2624, + "num_input_tokens_seen": 20024688, + "step": 30680 + }, + { + "epoch": 18.09257075471698, + "grad_norm": 4.245615482330322, + "learning_rate": 2.7469000645905295e-07, + "loss": 0.3607, + "num_input_tokens_seen": 20028016, + "step": 30685 + }, + { + "epoch": 18.09551886792453, + "grad_norm": 4.443362236022949, + "learning_rate": 2.738496345663827e-07, + "loss": 0.3788, + "num_input_tokens_seen": 20031248, + "step": 30690 + }, + { + "epoch": 18.098466981132077, + "grad_norm": 4.10257625579834, + "learning_rate": 2.7301051394239e-07, + "loss": 0.2775, + "num_input_tokens_seen": 20034128, + "step": 30695 + }, + { + "epoch": 18.10141509433962, + "grad_norm": 2.2175498008728027, + "learning_rate": 2.72172644809236e-07, + "loss": 0.1811, + "num_input_tokens_seen": 20038032, + "step": 30700 + }, + { + "epoch": 18.10436320754717, + "grad_norm": 3.892087697982788, + "learning_rate": 2.7133602738875e-07, + "loss": 0.3259, + "num_input_tokens_seen": 20040880, + "step": 30705 + }, + { + "epoch": 18.107311320754718, + "grad_norm": 4.485229969024658, + "learning_rate": 2.7050066190242976e-07, + "loss": 0.3329, + "num_input_tokens_seen": 20044720, + "step": 30710 + }, + { + "epoch": 18.110259433962263, + "grad_norm": 3.0180797576904297, + "learning_rate": 2.696665485714428e-07, + "loss": 0.2293, + "num_input_tokens_seen": 20047888, + "step": 30715 + }, + { + "epoch": 18.11320754716981, + "grad_norm": 3.1001508235931396, + "learning_rate": 2.6883368761662367e-07, + "loss": 0.2933, + "num_input_tokens_seen": 20050896, + "step": 30720 + }, + { + "epoch": 18.11615566037736, + "grad_norm": 4.274606704711914, + "learning_rate": 2.680020792584759e-07, + "loss": 0.321, + "num_input_tokens_seen": 20053648, + "step": 30725 + }, + { + "epoch": 18.119103773584907, + "grad_norm": 6.149851322174072, + "learning_rate": 2.6717172371717113e-07, + "loss": 0.2552, + "num_input_tokens_seen": 20056368, + "step": 30730 + }, + { + "epoch": 18.122051886792452, + "grad_norm": 5.6037468910217285, + "learning_rate": 2.663426212125503e-07, + "loss": 0.3099, + "num_input_tokens_seen": 20058672, + "step": 30735 + }, + { + "epoch": 18.125, + "grad_norm": 3.153165817260742, + "learning_rate": 2.655147719641216e-07, + "loss": 0.4385, + "num_input_tokens_seen": 20061936, + "step": 30740 + }, + { + "epoch": 18.127948113207548, + "grad_norm": 4.360229969024658, + "learning_rate": 2.646881761910602e-07, + "loss": 0.2967, + "num_input_tokens_seen": 20064560, + "step": 30745 + }, + { + "epoch": 18.130896226415093, + "grad_norm": 3.8156898021698, + "learning_rate": 2.638628341122135e-07, + "loss": 0.3224, + "num_input_tokens_seen": 20068112, + "step": 30750 + }, + { + "epoch": 18.13384433962264, + "grad_norm": 6.548239707946777, + "learning_rate": 2.6303874594609314e-07, + "loss": 0.2529, + "num_input_tokens_seen": 20070960, + "step": 30755 + }, + { + "epoch": 18.13679245283019, + "grad_norm": 5.625372886657715, + "learning_rate": 2.622159119108797e-07, + "loss": 0.2808, + "num_input_tokens_seen": 20074224, + "step": 30760 + }, + { + "epoch": 18.139740566037737, + "grad_norm": 2.3498785495758057, + "learning_rate": 2.6139433222442226e-07, + "loss": 0.2716, + "num_input_tokens_seen": 20077648, + "step": 30765 + }, + { + "epoch": 18.142688679245282, + "grad_norm": 3.6622071266174316, + "learning_rate": 2.6057400710423787e-07, + "loss": 0.181, + "num_input_tokens_seen": 20081104, + "step": 30770 + }, + { + "epoch": 18.14563679245283, + "grad_norm": 3.964223623275757, + "learning_rate": 2.5975493676751004e-07, + "loss": 0.3386, + "num_input_tokens_seen": 20083728, + "step": 30775 + }, + { + "epoch": 18.14858490566038, + "grad_norm": 2.6674911975860596, + "learning_rate": 2.589371214310926e-07, + "loss": 0.3307, + "num_input_tokens_seen": 20086832, + "step": 30780 + }, + { + "epoch": 18.151533018867923, + "grad_norm": 3.323371410369873, + "learning_rate": 2.581205613115051e-07, + "loss": 0.2589, + "num_input_tokens_seen": 20090992, + "step": 30785 + }, + { + "epoch": 18.15448113207547, + "grad_norm": 4.968255519866943, + "learning_rate": 2.573052566249357e-07, + "loss": 0.3204, + "num_input_tokens_seen": 20093776, + "step": 30790 + }, + { + "epoch": 18.15742924528302, + "grad_norm": 3.695032835006714, + "learning_rate": 2.5649120758723945e-07, + "loss": 0.3867, + "num_input_tokens_seen": 20097232, + "step": 30795 + }, + { + "epoch": 18.160377358490567, + "grad_norm": 2.4645185470581055, + "learning_rate": 2.5567841441393906e-07, + "loss": 0.2212, + "num_input_tokens_seen": 20101328, + "step": 30800 + }, + { + "epoch": 18.163325471698112, + "grad_norm": 3.7863998413085938, + "learning_rate": 2.548668773202245e-07, + "loss": 0.2554, + "num_input_tokens_seen": 20103952, + "step": 30805 + }, + { + "epoch": 18.16627358490566, + "grad_norm": 3.025130033493042, + "learning_rate": 2.5405659652095573e-07, + "loss": 0.365, + "num_input_tokens_seen": 20106736, + "step": 30810 + }, + { + "epoch": 18.16922169811321, + "grad_norm": 3.5622360706329346, + "learning_rate": 2.5324757223065655e-07, + "loss": 0.2938, + "num_input_tokens_seen": 20109808, + "step": 30815 + }, + { + "epoch": 18.172169811320753, + "grad_norm": 5.007580757141113, + "learning_rate": 2.524398046635207e-07, + "loss": 0.5024, + "num_input_tokens_seen": 20113648, + "step": 30820 + }, + { + "epoch": 18.1751179245283, + "grad_norm": 2.71625018119812, + "learning_rate": 2.51633294033406e-07, + "loss": 0.2344, + "num_input_tokens_seen": 20117328, + "step": 30825 + }, + { + "epoch": 18.17806603773585, + "grad_norm": 3.0770347118377686, + "learning_rate": 2.5082804055384214e-07, + "loss": 0.27, + "num_input_tokens_seen": 20120368, + "step": 30830 + }, + { + "epoch": 18.181014150943398, + "grad_norm": 4.538959503173828, + "learning_rate": 2.50024044438022e-07, + "loss": 0.3116, + "num_input_tokens_seen": 20122928, + "step": 30835 + }, + { + "epoch": 18.183962264150942, + "grad_norm": 3.5060131549835205, + "learning_rate": 2.492213058988069e-07, + "loss": 0.2658, + "num_input_tokens_seen": 20126384, + "step": 30840 + }, + { + "epoch": 18.18691037735849, + "grad_norm": 3.048906087875366, + "learning_rate": 2.4841982514872633e-07, + "loss": 0.1827, + "num_input_tokens_seen": 20129488, + "step": 30845 + }, + { + "epoch": 18.18985849056604, + "grad_norm": 6.240050792694092, + "learning_rate": 2.4761960239997497e-07, + "loss": 0.3116, + "num_input_tokens_seen": 20132176, + "step": 30850 + }, + { + "epoch": 18.192806603773583, + "grad_norm": 3.786304473876953, + "learning_rate": 2.4682063786441556e-07, + "loss": 0.3685, + "num_input_tokens_seen": 20135696, + "step": 30855 + }, + { + "epoch": 18.19575471698113, + "grad_norm": 3.792616367340088, + "learning_rate": 2.460229317535778e-07, + "loss": 0.3455, + "num_input_tokens_seen": 20138576, + "step": 30860 + }, + { + "epoch": 18.19870283018868, + "grad_norm": 2.999385356903076, + "learning_rate": 2.4522648427865725e-07, + "loss": 0.3234, + "num_input_tokens_seen": 20142256, + "step": 30865 + }, + { + "epoch": 18.201650943396228, + "grad_norm": 2.3591747283935547, + "learning_rate": 2.444312956505163e-07, + "loss": 0.285, + "num_input_tokens_seen": 20145808, + "step": 30870 + }, + { + "epoch": 18.204599056603772, + "grad_norm": 2.9386510848999023, + "learning_rate": 2.4363736607968537e-07, + "loss": 0.4278, + "num_input_tokens_seen": 20148432, + "step": 30875 + }, + { + "epoch": 18.20754716981132, + "grad_norm": 5.070833683013916, + "learning_rate": 2.428446957763608e-07, + "loss": 0.2909, + "num_input_tokens_seen": 20150800, + "step": 30880 + }, + { + "epoch": 18.21049528301887, + "grad_norm": 3.2415924072265625, + "learning_rate": 2.4205328495040535e-07, + "loss": 0.2139, + "num_input_tokens_seen": 20155728, + "step": 30885 + }, + { + "epoch": 18.213443396226417, + "grad_norm": 3.8197202682495117, + "learning_rate": 2.412631338113486e-07, + "loss": 0.3497, + "num_input_tokens_seen": 20158608, + "step": 30890 + }, + { + "epoch": 18.21639150943396, + "grad_norm": 4.628184795379639, + "learning_rate": 2.404742425683848e-07, + "loss": 0.4422, + "num_input_tokens_seen": 20161904, + "step": 30895 + }, + { + "epoch": 18.21933962264151, + "grad_norm": 3.5616936683654785, + "learning_rate": 2.3968661143037864e-07, + "loss": 0.4067, + "num_input_tokens_seen": 20165296, + "step": 30900 + }, + { + "epoch": 18.222287735849058, + "grad_norm": 4.433199882507324, + "learning_rate": 2.3890024060585823e-07, + "loss": 0.4141, + "num_input_tokens_seen": 20168400, + "step": 30905 + }, + { + "epoch": 18.225235849056602, + "grad_norm": 4.060523509979248, + "learning_rate": 2.3811513030301826e-07, + "loss": 0.349, + "num_input_tokens_seen": 20172080, + "step": 30910 + }, + { + "epoch": 18.22818396226415, + "grad_norm": 4.877480506896973, + "learning_rate": 2.373312807297201e-07, + "loss": 0.2551, + "num_input_tokens_seen": 20175088, + "step": 30915 + }, + { + "epoch": 18.2311320754717, + "grad_norm": 3.3634192943573, + "learning_rate": 2.3654869209349007e-07, + "loss": 0.2623, + "num_input_tokens_seen": 20178096, + "step": 30920 + }, + { + "epoch": 18.234080188679247, + "grad_norm": 3.5076520442962646, + "learning_rate": 2.357673646015246e-07, + "loss": 0.253, + "num_input_tokens_seen": 20180848, + "step": 30925 + }, + { + "epoch": 18.23702830188679, + "grad_norm": 3.9768006801605225, + "learning_rate": 2.3498729846068103e-07, + "loss": 0.2387, + "num_input_tokens_seen": 20183536, + "step": 30930 + }, + { + "epoch": 18.23997641509434, + "grad_norm": 5.699945449829102, + "learning_rate": 2.342084938774869e-07, + "loss": 0.5658, + "num_input_tokens_seen": 20186000, + "step": 30935 + }, + { + "epoch": 18.242924528301888, + "grad_norm": 7.519649982452393, + "learning_rate": 2.334309510581334e-07, + "loss": 0.3096, + "num_input_tokens_seen": 20188240, + "step": 30940 + }, + { + "epoch": 18.245872641509433, + "grad_norm": 3.784212589263916, + "learning_rate": 2.3265467020847864e-07, + "loss": 0.3215, + "num_input_tokens_seen": 20190512, + "step": 30945 + }, + { + "epoch": 18.24882075471698, + "grad_norm": 3.8966903686523438, + "learning_rate": 2.31879651534046e-07, + "loss": 0.3134, + "num_input_tokens_seen": 20193040, + "step": 30950 + }, + { + "epoch": 18.25176886792453, + "grad_norm": 4.3160176277160645, + "learning_rate": 2.311058952400247e-07, + "loss": 0.4113, + "num_input_tokens_seen": 20196432, + "step": 30955 + }, + { + "epoch": 18.254716981132077, + "grad_norm": 3.0872766971588135, + "learning_rate": 2.3033340153127026e-07, + "loss": 0.3499, + "num_input_tokens_seen": 20198800, + "step": 30960 + }, + { + "epoch": 18.25766509433962, + "grad_norm": 2.901895523071289, + "learning_rate": 2.295621706123041e-07, + "loss": 0.4237, + "num_input_tokens_seen": 20202384, + "step": 30965 + }, + { + "epoch": 18.26061320754717, + "grad_norm": 2.930816888809204, + "learning_rate": 2.287922026873135e-07, + "loss": 0.3298, + "num_input_tokens_seen": 20206832, + "step": 30970 + }, + { + "epoch": 18.263561320754718, + "grad_norm": 6.388247013092041, + "learning_rate": 2.2802349796014923e-07, + "loss": 0.2324, + "num_input_tokens_seen": 20209904, + "step": 30975 + }, + { + "epoch": 18.266509433962263, + "grad_norm": 3.3831570148468018, + "learning_rate": 2.2725605663433013e-07, + "loss": 0.1808, + "num_input_tokens_seen": 20213712, + "step": 30980 + }, + { + "epoch": 18.26945754716981, + "grad_norm": 3.8719987869262695, + "learning_rate": 2.264898789130393e-07, + "loss": 0.2718, + "num_input_tokens_seen": 20216976, + "step": 30985 + }, + { + "epoch": 18.27240566037736, + "grad_norm": 2.7047226428985596, + "learning_rate": 2.2572496499912554e-07, + "loss": 0.3002, + "num_input_tokens_seen": 20219408, + "step": 30990 + }, + { + "epoch": 18.275353773584907, + "grad_norm": 2.986130475997925, + "learning_rate": 2.2496131509510354e-07, + "loss": 0.3919, + "num_input_tokens_seen": 20226672, + "step": 30995 + }, + { + "epoch": 18.278301886792452, + "grad_norm": 3.546712875366211, + "learning_rate": 2.2419892940315268e-07, + "loss": 0.2783, + "num_input_tokens_seen": 20230448, + "step": 31000 + }, + { + "epoch": 18.28125, + "grad_norm": 4.5099778175354, + "learning_rate": 2.2343780812511819e-07, + "loss": 0.3248, + "num_input_tokens_seen": 20233936, + "step": 31005 + }, + { + "epoch": 18.284198113207548, + "grad_norm": 1.9003348350524902, + "learning_rate": 2.2267795146250936e-07, + "loss": 0.3385, + "num_input_tokens_seen": 20237232, + "step": 31010 + }, + { + "epoch": 18.287146226415093, + "grad_norm": 4.365650177001953, + "learning_rate": 2.2191935961650146e-07, + "loss": 0.3543, + "num_input_tokens_seen": 20240112, + "step": 31015 + }, + { + "epoch": 18.29009433962264, + "grad_norm": 5.9216413497924805, + "learning_rate": 2.2116203278793603e-07, + "loss": 0.4193, + "num_input_tokens_seen": 20243856, + "step": 31020 + }, + { + "epoch": 18.29304245283019, + "grad_norm": 1.963316559791565, + "learning_rate": 2.2040597117731766e-07, + "loss": 0.3104, + "num_input_tokens_seen": 20246704, + "step": 31025 + }, + { + "epoch": 18.295990566037737, + "grad_norm": 3.0110862255096436, + "learning_rate": 2.1965117498481793e-07, + "loss": 0.1606, + "num_input_tokens_seen": 20252592, + "step": 31030 + }, + { + "epoch": 18.298938679245282, + "grad_norm": 3.420036792755127, + "learning_rate": 2.188976444102714e-07, + "loss": 0.3302, + "num_input_tokens_seen": 20256976, + "step": 31035 + }, + { + "epoch": 18.30188679245283, + "grad_norm": 2.4606235027313232, + "learning_rate": 2.181453796531796e-07, + "loss": 0.3509, + "num_input_tokens_seen": 20260464, + "step": 31040 + }, + { + "epoch": 18.30483490566038, + "grad_norm": 5.958358287811279, + "learning_rate": 2.1739438091270658e-07, + "loss": 0.2147, + "num_input_tokens_seen": 20262864, + "step": 31045 + }, + { + "epoch": 18.307783018867923, + "grad_norm": 4.798393249511719, + "learning_rate": 2.1664464838768329e-07, + "loss": 0.2624, + "num_input_tokens_seen": 20265744, + "step": 31050 + }, + { + "epoch": 18.31073113207547, + "grad_norm": 4.676506042480469, + "learning_rate": 2.1589618227660426e-07, + "loss": 0.2706, + "num_input_tokens_seen": 20268272, + "step": 31055 + }, + { + "epoch": 18.31367924528302, + "grad_norm": 5.121920108795166, + "learning_rate": 2.151489827776293e-07, + "loss": 0.3173, + "num_input_tokens_seen": 20271088, + "step": 31060 + }, + { + "epoch": 18.316627358490567, + "grad_norm": 3.1759066581726074, + "learning_rate": 2.1440305008858298e-07, + "loss": 0.3216, + "num_input_tokens_seen": 20273680, + "step": 31065 + }, + { + "epoch": 18.319575471698112, + "grad_norm": 3.0045599937438965, + "learning_rate": 2.1365838440695397e-07, + "loss": 0.2761, + "num_input_tokens_seen": 20277232, + "step": 31070 + }, + { + "epoch": 18.32252358490566, + "grad_norm": 6.1052093505859375, + "learning_rate": 2.129149859298957e-07, + "loss": 0.4049, + "num_input_tokens_seen": 20280816, + "step": 31075 + }, + { + "epoch": 18.32547169811321, + "grad_norm": 3.8784468173980713, + "learning_rate": 2.1217285485422622e-07, + "loss": 0.2688, + "num_input_tokens_seen": 20283920, + "step": 31080 + }, + { + "epoch": 18.328419811320753, + "grad_norm": 6.382956027984619, + "learning_rate": 2.114319913764268e-07, + "loss": 0.3839, + "num_input_tokens_seen": 20287472, + "step": 31085 + }, + { + "epoch": 18.3313679245283, + "grad_norm": 3.877950429916382, + "learning_rate": 2.10692395692646e-07, + "loss": 0.2511, + "num_input_tokens_seen": 20290448, + "step": 31090 + }, + { + "epoch": 18.33431603773585, + "grad_norm": 4.273087024688721, + "learning_rate": 2.0995406799869444e-07, + "loss": 0.2557, + "num_input_tokens_seen": 20294448, + "step": 31095 + }, + { + "epoch": 18.337264150943398, + "grad_norm": 2.5536012649536133, + "learning_rate": 2.0921700849004743e-07, + "loss": 0.307, + "num_input_tokens_seen": 20298256, + "step": 31100 + }, + { + "epoch": 18.340212264150942, + "grad_norm": 5.228384017944336, + "learning_rate": 2.084812173618439e-07, + "loss": 0.2463, + "num_input_tokens_seen": 20301456, + "step": 31105 + }, + { + "epoch": 18.34316037735849, + "grad_norm": 3.6189115047454834, + "learning_rate": 2.0774669480888853e-07, + "loss": 0.2524, + "num_input_tokens_seen": 20304144, + "step": 31110 + }, + { + "epoch": 18.34610849056604, + "grad_norm": 4.273752212524414, + "learning_rate": 2.0701344102564912e-07, + "loss": 0.2215, + "num_input_tokens_seen": 20306480, + "step": 31115 + }, + { + "epoch": 18.349056603773583, + "grad_norm": 4.518650531768799, + "learning_rate": 2.062814562062576e-07, + "loss": 0.2812, + "num_input_tokens_seen": 20309072, + "step": 31120 + }, + { + "epoch": 18.35200471698113, + "grad_norm": 2.799726724624634, + "learning_rate": 2.0555074054451063e-07, + "loss": 0.4939, + "num_input_tokens_seen": 20311856, + "step": 31125 + }, + { + "epoch": 18.35495283018868, + "grad_norm": 2.913865327835083, + "learning_rate": 2.0482129423386843e-07, + "loss": 0.3563, + "num_input_tokens_seen": 20315600, + "step": 31130 + }, + { + "epoch": 18.357900943396228, + "grad_norm": 4.299365997314453, + "learning_rate": 2.040931174674543e-07, + "loss": 0.2881, + "num_input_tokens_seen": 20318480, + "step": 31135 + }, + { + "epoch": 18.360849056603772, + "grad_norm": 4.572053909301758, + "learning_rate": 2.0336621043805682e-07, + "loss": 0.1799, + "num_input_tokens_seen": 20321008, + "step": 31140 + }, + { + "epoch": 18.36379716981132, + "grad_norm": 5.818107604980469, + "learning_rate": 2.0264057333812704e-07, + "loss": 0.2926, + "num_input_tokens_seen": 20323728, + "step": 31145 + }, + { + "epoch": 18.36674528301887, + "grad_norm": 2.7387173175811768, + "learning_rate": 2.0191620635978127e-07, + "loss": 0.2844, + "num_input_tokens_seen": 20327504, + "step": 31150 + }, + { + "epoch": 18.369693396226417, + "grad_norm": 2.655416965484619, + "learning_rate": 2.0119310969479833e-07, + "loss": 0.3416, + "num_input_tokens_seen": 20331088, + "step": 31155 + }, + { + "epoch": 18.37264150943396, + "grad_norm": 3.6547698974609375, + "learning_rate": 2.004712835346212e-07, + "loss": 0.2262, + "num_input_tokens_seen": 20334352, + "step": 31160 + }, + { + "epoch": 18.37558962264151, + "grad_norm": 4.643865585327148, + "learning_rate": 1.99750728070357e-07, + "loss": 0.2664, + "num_input_tokens_seen": 20337456, + "step": 31165 + }, + { + "epoch": 18.378537735849058, + "grad_norm": 4.551769733428955, + "learning_rate": 1.9903144349277536e-07, + "loss": 0.3164, + "num_input_tokens_seen": 20341072, + "step": 31170 + }, + { + "epoch": 18.381485849056602, + "grad_norm": 6.5776567459106445, + "learning_rate": 1.983134299923095e-07, + "loss": 0.2525, + "num_input_tokens_seen": 20343632, + "step": 31175 + }, + { + "epoch": 18.38443396226415, + "grad_norm": 4.07740592956543, + "learning_rate": 1.9759668775905737e-07, + "loss": 0.3141, + "num_input_tokens_seen": 20346928, + "step": 31180 + }, + { + "epoch": 18.3873820754717, + "grad_norm": 3.0550296306610107, + "learning_rate": 1.9688121698277995e-07, + "loss": 0.2668, + "num_input_tokens_seen": 20350032, + "step": 31185 + }, + { + "epoch": 18.390330188679247, + "grad_norm": 3.922240734100342, + "learning_rate": 1.9616701785290015e-07, + "loss": 0.2537, + "num_input_tokens_seen": 20353200, + "step": 31190 + }, + { + "epoch": 18.39327830188679, + "grad_norm": 4.638723373413086, + "learning_rate": 1.954540905585056e-07, + "loss": 0.4038, + "num_input_tokens_seen": 20356080, + "step": 31195 + }, + { + "epoch": 18.39622641509434, + "grad_norm": 4.057901859283447, + "learning_rate": 1.9474243528834757e-07, + "loss": 0.2868, + "num_input_tokens_seen": 20359280, + "step": 31200 + }, + { + "epoch": 18.399174528301888, + "grad_norm": 2.1456689834594727, + "learning_rate": 1.9403205223083866e-07, + "loss": 0.2359, + "num_input_tokens_seen": 20362448, + "step": 31205 + }, + { + "epoch": 18.402122641509433, + "grad_norm": 3.055636167526245, + "learning_rate": 1.9332294157405619e-07, + "loss": 0.3469, + "num_input_tokens_seen": 20365904, + "step": 31210 + }, + { + "epoch": 18.40507075471698, + "grad_norm": 5.299835205078125, + "learning_rate": 1.926151035057411e-07, + "loss": 0.2711, + "num_input_tokens_seen": 20368880, + "step": 31215 + }, + { + "epoch": 18.40801886792453, + "grad_norm": 4.883914470672607, + "learning_rate": 1.9190853821329626e-07, + "loss": 0.2884, + "num_input_tokens_seen": 20371920, + "step": 31220 + }, + { + "epoch": 18.410966981132077, + "grad_norm": 3.625037431716919, + "learning_rate": 1.9120324588378757e-07, + "loss": 0.4157, + "num_input_tokens_seen": 20375440, + "step": 31225 + }, + { + "epoch": 18.41391509433962, + "grad_norm": 4.024853706359863, + "learning_rate": 1.9049922670394461e-07, + "loss": 0.3652, + "num_input_tokens_seen": 20378992, + "step": 31230 + }, + { + "epoch": 18.41686320754717, + "grad_norm": 3.7979280948638916, + "learning_rate": 1.897964808601588e-07, + "loss": 0.2763, + "num_input_tokens_seen": 20381616, + "step": 31235 + }, + { + "epoch": 18.419811320754718, + "grad_norm": 3.5593016147613525, + "learning_rate": 1.8909500853848517e-07, + "loss": 0.2763, + "num_input_tokens_seen": 20384880, + "step": 31240 + }, + { + "epoch": 18.422759433962263, + "grad_norm": 4.1576032638549805, + "learning_rate": 1.8839480992464243e-07, + "loss": 0.4847, + "num_input_tokens_seen": 20388048, + "step": 31245 + }, + { + "epoch": 18.42570754716981, + "grad_norm": 2.9888906478881836, + "learning_rate": 1.8769588520401005e-07, + "loss": 0.3937, + "num_input_tokens_seen": 20391824, + "step": 31250 + }, + { + "epoch": 18.42865566037736, + "grad_norm": 8.12476634979248, + "learning_rate": 1.8699823456163279e-07, + "loss": 0.2707, + "num_input_tokens_seen": 20395280, + "step": 31255 + }, + { + "epoch": 18.431603773584907, + "grad_norm": 9.915144920349121, + "learning_rate": 1.8630185818221514e-07, + "loss": 0.3827, + "num_input_tokens_seen": 20398320, + "step": 31260 + }, + { + "epoch": 18.434551886792452, + "grad_norm": 5.705941677093506, + "learning_rate": 1.856067562501268e-07, + "loss": 0.3208, + "num_input_tokens_seen": 20400944, + "step": 31265 + }, + { + "epoch": 18.4375, + "grad_norm": 5.286247253417969, + "learning_rate": 1.8491292894939837e-07, + "loss": 0.2665, + "num_input_tokens_seen": 20404112, + "step": 31270 + }, + { + "epoch": 18.440448113207548, + "grad_norm": 2.721579074859619, + "learning_rate": 1.8422037646372405e-07, + "loss": 0.2769, + "num_input_tokens_seen": 20408048, + "step": 31275 + }, + { + "epoch": 18.443396226415093, + "grad_norm": 2.894845485687256, + "learning_rate": 1.8352909897645989e-07, + "loss": 0.3773, + "num_input_tokens_seen": 20411792, + "step": 31280 + }, + { + "epoch": 18.44634433962264, + "grad_norm": 2.38879656791687, + "learning_rate": 1.8283909667062448e-07, + "loss": 0.3554, + "num_input_tokens_seen": 20414896, + "step": 31285 + }, + { + "epoch": 18.44929245283019, + "grad_norm": 3.0960772037506104, + "learning_rate": 1.82150369728899e-07, + "loss": 0.2903, + "num_input_tokens_seen": 20417744, + "step": 31290 + }, + { + "epoch": 18.452240566037737, + "grad_norm": 3.889052629470825, + "learning_rate": 1.814629183336275e-07, + "loss": 0.274, + "num_input_tokens_seen": 20421552, + "step": 31295 + }, + { + "epoch": 18.455188679245282, + "grad_norm": 4.070337772369385, + "learning_rate": 1.807767426668139e-07, + "loss": 0.3764, + "num_input_tokens_seen": 20424400, + "step": 31300 + }, + { + "epoch": 18.45813679245283, + "grad_norm": 6.076757907867432, + "learning_rate": 1.8009184291012783e-07, + "loss": 0.2896, + "num_input_tokens_seen": 20427600, + "step": 31305 + }, + { + "epoch": 18.46108490566038, + "grad_norm": 3.4834136962890625, + "learning_rate": 1.7940821924489926e-07, + "loss": 0.3118, + "num_input_tokens_seen": 20431568, + "step": 31310 + }, + { + "epoch": 18.464033018867923, + "grad_norm": 3.16585636138916, + "learning_rate": 1.7872587185212009e-07, + "loss": 0.3353, + "num_input_tokens_seen": 20434704, + "step": 31315 + }, + { + "epoch": 18.46698113207547, + "grad_norm": 4.02031946182251, + "learning_rate": 1.7804480091244524e-07, + "loss": 0.3823, + "num_input_tokens_seen": 20437968, + "step": 31320 + }, + { + "epoch": 18.46992924528302, + "grad_norm": 3.884927272796631, + "learning_rate": 1.7736500660619104e-07, + "loss": 0.3554, + "num_input_tokens_seen": 20441872, + "step": 31325 + }, + { + "epoch": 18.472877358490567, + "grad_norm": 5.149443626403809, + "learning_rate": 1.766864891133352e-07, + "loss": 0.3027, + "num_input_tokens_seen": 20445072, + "step": 31330 + }, + { + "epoch": 18.475825471698112, + "grad_norm": 5.718039512634277, + "learning_rate": 1.7600924861351843e-07, + "loss": 0.3749, + "num_input_tokens_seen": 20451920, + "step": 31335 + }, + { + "epoch": 18.47877358490566, + "grad_norm": 3.3054327964782715, + "learning_rate": 1.7533328528604398e-07, + "loss": 0.2864, + "num_input_tokens_seen": 20456656, + "step": 31340 + }, + { + "epoch": 18.48172169811321, + "grad_norm": 5.711171627044678, + "learning_rate": 1.746585993098754e-07, + "loss": 0.2848, + "num_input_tokens_seen": 20459600, + "step": 31345 + }, + { + "epoch": 18.484669811320753, + "grad_norm": 3.836273193359375, + "learning_rate": 1.7398519086363864e-07, + "loss": 0.2667, + "num_input_tokens_seen": 20462768, + "step": 31350 + }, + { + "epoch": 18.4876179245283, + "grad_norm": 2.85617995262146, + "learning_rate": 1.733130601256211e-07, + "loss": 0.2421, + "num_input_tokens_seen": 20465488, + "step": 31355 + }, + { + "epoch": 18.49056603773585, + "grad_norm": 3.5198850631713867, + "learning_rate": 1.7264220727377323e-07, + "loss": 0.5042, + "num_input_tokens_seen": 20469264, + "step": 31360 + }, + { + "epoch": 18.493514150943398, + "grad_norm": 2.750945568084717, + "learning_rate": 1.7197263248570517e-07, + "loss": 0.2789, + "num_input_tokens_seen": 20472176, + "step": 31365 + }, + { + "epoch": 18.496462264150942, + "grad_norm": 4.666244983673096, + "learning_rate": 1.7130433593869124e-07, + "loss": 0.3625, + "num_input_tokens_seen": 20476048, + "step": 31370 + }, + { + "epoch": 18.49941037735849, + "grad_norm": 3.435650587081909, + "learning_rate": 1.706373178096643e-07, + "loss": 0.2588, + "num_input_tokens_seen": 20479888, + "step": 31375 + }, + { + "epoch": 18.50235849056604, + "grad_norm": 4.002604007720947, + "learning_rate": 1.6997157827522092e-07, + "loss": 0.3334, + "num_input_tokens_seen": 20482896, + "step": 31380 + }, + { + "epoch": 18.505306603773583, + "grad_norm": 3.4299371242523193, + "learning_rate": 1.6930711751161843e-07, + "loss": 0.2593, + "num_input_tokens_seen": 20486352, + "step": 31385 + }, + { + "epoch": 18.50825471698113, + "grad_norm": 3.40209698677063, + "learning_rate": 1.6864393569477556e-07, + "loss": 0.2878, + "num_input_tokens_seen": 20489456, + "step": 31390 + }, + { + "epoch": 18.51120283018868, + "grad_norm": 4.260105133056641, + "learning_rate": 1.6798203300027295e-07, + "loss": 0.2706, + "num_input_tokens_seen": 20493744, + "step": 31395 + }, + { + "epoch": 18.514150943396228, + "grad_norm": 2.823831558227539, + "learning_rate": 1.6732140960335152e-07, + "loss": 0.4022, + "num_input_tokens_seen": 20496400, + "step": 31400 + }, + { + "epoch": 18.517099056603772, + "grad_norm": 4.569741249084473, + "learning_rate": 1.666620656789153e-07, + "loss": 0.411, + "num_input_tokens_seen": 20498768, + "step": 31405 + }, + { + "epoch": 18.52004716981132, + "grad_norm": 7.168444633483887, + "learning_rate": 1.660040014015274e-07, + "loss": 0.2382, + "num_input_tokens_seen": 20501584, + "step": 31410 + }, + { + "epoch": 18.52299528301887, + "grad_norm": 3.0427677631378174, + "learning_rate": 1.6534721694541344e-07, + "loss": 0.1969, + "num_input_tokens_seen": 20504144, + "step": 31415 + }, + { + "epoch": 18.525943396226417, + "grad_norm": 2.8198704719543457, + "learning_rate": 1.6469171248445993e-07, + "loss": 0.3683, + "num_input_tokens_seen": 20507728, + "step": 31420 + }, + { + "epoch": 18.52889150943396, + "grad_norm": 5.25468111038208, + "learning_rate": 1.6403748819221464e-07, + "loss": 0.2864, + "num_input_tokens_seen": 20511472, + "step": 31425 + }, + { + "epoch": 18.53183962264151, + "grad_norm": 3.428649663925171, + "learning_rate": 1.6338454424188632e-07, + "loss": 0.2558, + "num_input_tokens_seen": 20514448, + "step": 31430 + }, + { + "epoch": 18.534787735849058, + "grad_norm": 3.517892360687256, + "learning_rate": 1.6273288080634442e-07, + "loss": 0.3448, + "num_input_tokens_seen": 20517584, + "step": 31435 + }, + { + "epoch": 18.537735849056602, + "grad_norm": 2.573636293411255, + "learning_rate": 1.6208249805811982e-07, + "loss": 0.3152, + "num_input_tokens_seen": 20521456, + "step": 31440 + }, + { + "epoch": 18.54068396226415, + "grad_norm": 3.259951114654541, + "learning_rate": 1.6143339616940423e-07, + "loss": 0.1984, + "num_input_tokens_seen": 20524336, + "step": 31445 + }, + { + "epoch": 18.5436320754717, + "grad_norm": 3.234400749206543, + "learning_rate": 1.6078557531205018e-07, + "loss": 0.2552, + "num_input_tokens_seen": 20527856, + "step": 31450 + }, + { + "epoch": 18.546580188679247, + "grad_norm": 3.0910327434539795, + "learning_rate": 1.601390356575705e-07, + "loss": 0.3231, + "num_input_tokens_seen": 20532080, + "step": 31455 + }, + { + "epoch": 18.54952830188679, + "grad_norm": 2.6766817569732666, + "learning_rate": 1.5949377737713988e-07, + "loss": 0.3175, + "num_input_tokens_seen": 20536560, + "step": 31460 + }, + { + "epoch": 18.55247641509434, + "grad_norm": 4.010452747344971, + "learning_rate": 1.5884980064159338e-07, + "loss": 0.273, + "num_input_tokens_seen": 20539472, + "step": 31465 + }, + { + "epoch": 18.555424528301888, + "grad_norm": 3.956282377243042, + "learning_rate": 1.5820710562142627e-07, + "loss": 0.3347, + "num_input_tokens_seen": 20543728, + "step": 31470 + }, + { + "epoch": 18.558372641509433, + "grad_norm": 3.55106258392334, + "learning_rate": 1.575656924867952e-07, + "loss": 0.325, + "num_input_tokens_seen": 20547248, + "step": 31475 + }, + { + "epoch": 18.56132075471698, + "grad_norm": 2.6266252994537354, + "learning_rate": 1.5692556140751658e-07, + "loss": 0.3612, + "num_input_tokens_seen": 20550608, + "step": 31480 + }, + { + "epoch": 18.56426886792453, + "grad_norm": 5.347181797027588, + "learning_rate": 1.5628671255306706e-07, + "loss": 0.2816, + "num_input_tokens_seen": 20553744, + "step": 31485 + }, + { + "epoch": 18.567216981132077, + "grad_norm": 7.649124622344971, + "learning_rate": 1.556491460925863e-07, + "loss": 0.2924, + "num_input_tokens_seen": 20556016, + "step": 31490 + }, + { + "epoch": 18.57016509433962, + "grad_norm": 5.071175575256348, + "learning_rate": 1.550128621948721e-07, + "loss": 0.2443, + "num_input_tokens_seen": 20559184, + "step": 31495 + }, + { + "epoch": 18.57311320754717, + "grad_norm": 5.96088981628418, + "learning_rate": 1.5437786102838413e-07, + "loss": 0.3123, + "num_input_tokens_seen": 20561904, + "step": 31500 + }, + { + "epoch": 18.576061320754718, + "grad_norm": 3.1630208492279053, + "learning_rate": 1.5374414276124017e-07, + "loss": 0.3126, + "num_input_tokens_seen": 20566448, + "step": 31505 + }, + { + "epoch": 18.579009433962263, + "grad_norm": 3.5068132877349854, + "learning_rate": 1.5311170756122095e-07, + "loss": 0.2926, + "num_input_tokens_seen": 20568880, + "step": 31510 + }, + { + "epoch": 18.58195754716981, + "grad_norm": 3.951460361480713, + "learning_rate": 1.5248055559576647e-07, + "loss": 0.3251, + "num_input_tokens_seen": 20571888, + "step": 31515 + }, + { + "epoch": 18.58490566037736, + "grad_norm": 5.022000312805176, + "learning_rate": 1.5185068703197526e-07, + "loss": 0.4509, + "num_input_tokens_seen": 20574768, + "step": 31520 + }, + { + "epoch": 18.587853773584907, + "grad_norm": 3.242913007736206, + "learning_rate": 1.5122210203661004e-07, + "loss": 0.2427, + "num_input_tokens_seen": 20578032, + "step": 31525 + }, + { + "epoch": 18.590801886792452, + "grad_norm": 3.5637128353118896, + "learning_rate": 1.505948007760899e-07, + "loss": 0.3167, + "num_input_tokens_seen": 20580208, + "step": 31530 + }, + { + "epoch": 18.59375, + "grad_norm": 3.633031129837036, + "learning_rate": 1.4996878341649647e-07, + "loss": 0.4487, + "num_input_tokens_seen": 20583024, + "step": 31535 + }, + { + "epoch": 18.596698113207548, + "grad_norm": 4.795530319213867, + "learning_rate": 1.493440501235699e-07, + "loss": 0.3554, + "num_input_tokens_seen": 20585744, + "step": 31540 + }, + { + "epoch": 18.599646226415093, + "grad_norm": 3.434183120727539, + "learning_rate": 1.487206010627118e-07, + "loss": 0.308, + "num_input_tokens_seen": 20589296, + "step": 31545 + }, + { + "epoch": 18.60259433962264, + "grad_norm": 2.3632426261901855, + "learning_rate": 1.4809843639898124e-07, + "loss": 0.3127, + "num_input_tokens_seen": 20592400, + "step": 31550 + }, + { + "epoch": 18.60554245283019, + "grad_norm": 3.9533703327178955, + "learning_rate": 1.4747755629710093e-07, + "loss": 0.1978, + "num_input_tokens_seen": 20595024, + "step": 31555 + }, + { + "epoch": 18.608490566037737, + "grad_norm": 2.5510294437408447, + "learning_rate": 1.4685796092145045e-07, + "loss": 0.2828, + "num_input_tokens_seen": 20598544, + "step": 31560 + }, + { + "epoch": 18.611438679245282, + "grad_norm": 5.0588788986206055, + "learning_rate": 1.4623965043607135e-07, + "loss": 0.3113, + "num_input_tokens_seen": 20601456, + "step": 31565 + }, + { + "epoch": 18.61438679245283, + "grad_norm": 3.6721224784851074, + "learning_rate": 1.4562262500466273e-07, + "loss": 0.3825, + "num_input_tokens_seen": 20604336, + "step": 31570 + }, + { + "epoch": 18.61733490566038, + "grad_norm": 3.8494021892547607, + "learning_rate": 1.4500688479058556e-07, + "loss": 0.2636, + "num_input_tokens_seen": 20607184, + "step": 31575 + }, + { + "epoch": 18.620283018867923, + "grad_norm": 3.2143402099609375, + "learning_rate": 1.4439242995685943e-07, + "loss": 0.2943, + "num_input_tokens_seen": 20610896, + "step": 31580 + }, + { + "epoch": 18.62323113207547, + "grad_norm": 4.953895568847656, + "learning_rate": 1.4377926066616364e-07, + "loss": 0.3603, + "num_input_tokens_seen": 20613936, + "step": 31585 + }, + { + "epoch": 18.62617924528302, + "grad_norm": 5.222681999206543, + "learning_rate": 1.4316737708083783e-07, + "loss": 0.3972, + "num_input_tokens_seen": 20617136, + "step": 31590 + }, + { + "epoch": 18.629127358490567, + "grad_norm": 3.253453254699707, + "learning_rate": 1.4255677936288127e-07, + "loss": 0.3487, + "num_input_tokens_seen": 20620304, + "step": 31595 + }, + { + "epoch": 18.632075471698112, + "grad_norm": 6.208869934082031, + "learning_rate": 1.4194746767395184e-07, + "loss": 0.3254, + "num_input_tokens_seen": 20623728, + "step": 31600 + }, + { + "epoch": 18.63502358490566, + "grad_norm": 2.1416375637054443, + "learning_rate": 1.4133944217536722e-07, + "loss": 0.2835, + "num_input_tokens_seen": 20627088, + "step": 31605 + }, + { + "epoch": 18.63797169811321, + "grad_norm": 3.497950553894043, + "learning_rate": 1.4073270302810471e-07, + "loss": 0.2135, + "num_input_tokens_seen": 20631088, + "step": 31610 + }, + { + "epoch": 18.640919811320753, + "grad_norm": 4.315011024475098, + "learning_rate": 1.4012725039280084e-07, + "loss": 0.2733, + "num_input_tokens_seen": 20634032, + "step": 31615 + }, + { + "epoch": 18.6438679245283, + "grad_norm": 3.0566840171813965, + "learning_rate": 1.3952308442975292e-07, + "loss": 0.3314, + "num_input_tokens_seen": 20637104, + "step": 31620 + }, + { + "epoch": 18.64681603773585, + "grad_norm": 3.422818183898926, + "learning_rate": 1.3892020529891637e-07, + "loss": 0.2342, + "num_input_tokens_seen": 20640368, + "step": 31625 + }, + { + "epoch": 18.649764150943398, + "grad_norm": 6.178411483764648, + "learning_rate": 1.3831861315990514e-07, + "loss": 0.3761, + "num_input_tokens_seen": 20643056, + "step": 31630 + }, + { + "epoch": 18.652712264150942, + "grad_norm": 2.749654531478882, + "learning_rate": 1.377183081719935e-07, + "loss": 0.3219, + "num_input_tokens_seen": 20647376, + "step": 31635 + }, + { + "epoch": 18.65566037735849, + "grad_norm": 2.8359181880950928, + "learning_rate": 1.3711929049411544e-07, + "loss": 0.2729, + "num_input_tokens_seen": 20650864, + "step": 31640 + }, + { + "epoch": 18.65860849056604, + "grad_norm": 2.852304220199585, + "learning_rate": 1.365215602848624e-07, + "loss": 0.3813, + "num_input_tokens_seen": 20653712, + "step": 31645 + }, + { + "epoch": 18.661556603773583, + "grad_norm": 4.442895889282227, + "learning_rate": 1.3592511770248727e-07, + "loss": 0.3763, + "num_input_tokens_seen": 20656528, + "step": 31650 + }, + { + "epoch": 18.66450471698113, + "grad_norm": 5.450018882751465, + "learning_rate": 1.3532996290490041e-07, + "loss": 0.2725, + "num_input_tokens_seen": 20659440, + "step": 31655 + }, + { + "epoch": 18.66745283018868, + "grad_norm": 3.923981189727783, + "learning_rate": 1.347360960496713e-07, + "loss": 0.2856, + "num_input_tokens_seen": 20662096, + "step": 31660 + }, + { + "epoch": 18.670400943396228, + "grad_norm": 2.3668906688690186, + "learning_rate": 1.3414351729402862e-07, + "loss": 0.289, + "num_input_tokens_seen": 20665808, + "step": 31665 + }, + { + "epoch": 18.673349056603772, + "grad_norm": 3.667651653289795, + "learning_rate": 1.3355222679486025e-07, + "loss": 0.3186, + "num_input_tokens_seen": 20668848, + "step": 31670 + }, + { + "epoch": 18.67629716981132, + "grad_norm": 2.1343894004821777, + "learning_rate": 1.3296222470871367e-07, + "loss": 0.2851, + "num_input_tokens_seen": 20671760, + "step": 31675 + }, + { + "epoch": 18.67924528301887, + "grad_norm": 3.0044713020324707, + "learning_rate": 1.3237351119179287e-07, + "loss": 0.2815, + "num_input_tokens_seen": 20675472, + "step": 31680 + }, + { + "epoch": 18.682193396226417, + "grad_norm": 3.3570733070373535, + "learning_rate": 1.3178608639996425e-07, + "loss": 0.2789, + "num_input_tokens_seen": 20678128, + "step": 31685 + }, + { + "epoch": 18.68514150943396, + "grad_norm": 3.7691731452941895, + "learning_rate": 1.3119995048874957e-07, + "loss": 0.3361, + "num_input_tokens_seen": 20681360, + "step": 31690 + }, + { + "epoch": 18.68808962264151, + "grad_norm": 7.246938705444336, + "learning_rate": 1.3061510361333186e-07, + "loss": 0.3677, + "num_input_tokens_seen": 20684816, + "step": 31695 + }, + { + "epoch": 18.691037735849058, + "grad_norm": 4.411552429199219, + "learning_rate": 1.3003154592855116e-07, + "loss": 0.2802, + "num_input_tokens_seen": 20689232, + "step": 31700 + }, + { + "epoch": 18.693985849056602, + "grad_norm": 2.6474556922912598, + "learning_rate": 1.2944927758890668e-07, + "loss": 0.2515, + "num_input_tokens_seen": 20691664, + "step": 31705 + }, + { + "epoch": 18.69693396226415, + "grad_norm": 5.467953205108643, + "learning_rate": 1.2886829874855733e-07, + "loss": 0.2726, + "num_input_tokens_seen": 20694896, + "step": 31710 + }, + { + "epoch": 18.6998820754717, + "grad_norm": 2.5844666957855225, + "learning_rate": 1.2828860956131894e-07, + "loss": 0.2433, + "num_input_tokens_seen": 20699312, + "step": 31715 + }, + { + "epoch": 18.702830188679247, + "grad_norm": 3.1489176750183105, + "learning_rate": 1.2771021018066765e-07, + "loss": 0.2558, + "num_input_tokens_seen": 20703056, + "step": 31720 + }, + { + "epoch": 18.70577830188679, + "grad_norm": 3.6816179752349854, + "learning_rate": 1.271331007597365e-07, + "loss": 0.2757, + "num_input_tokens_seen": 20705520, + "step": 31725 + }, + { + "epoch": 18.70872641509434, + "grad_norm": 2.58306622505188, + "learning_rate": 1.2655728145131774e-07, + "loss": 0.3836, + "num_input_tokens_seen": 20708784, + "step": 31730 + }, + { + "epoch": 18.711674528301888, + "grad_norm": 2.371227264404297, + "learning_rate": 1.2598275240786105e-07, + "loss": 0.3546, + "num_input_tokens_seen": 20712176, + "step": 31735 + }, + { + "epoch": 18.714622641509433, + "grad_norm": 3.0417706966400146, + "learning_rate": 1.254095137814776e-07, + "loss": 0.3474, + "num_input_tokens_seen": 20715152, + "step": 31740 + }, + { + "epoch": 18.71757075471698, + "grad_norm": 3.8357276916503906, + "learning_rate": 1.2483756572393368e-07, + "loss": 0.2912, + "num_input_tokens_seen": 20718480, + "step": 31745 + }, + { + "epoch": 18.72051886792453, + "grad_norm": 3.006688356399536, + "learning_rate": 1.242669083866549e-07, + "loss": 0.2832, + "num_input_tokens_seen": 20720944, + "step": 31750 + }, + { + "epoch": 18.723466981132077, + "grad_norm": 3.0669965744018555, + "learning_rate": 1.2369754192072537e-07, + "loss": 0.3728, + "num_input_tokens_seen": 20723856, + "step": 31755 + }, + { + "epoch": 18.72641509433962, + "grad_norm": 4.026971340179443, + "learning_rate": 1.231294664768873e-07, + "loss": 0.3143, + "num_input_tokens_seen": 20726928, + "step": 31760 + }, + { + "epoch": 18.72936320754717, + "grad_norm": 3.7815651893615723, + "learning_rate": 1.225626822055409e-07, + "loss": 0.3305, + "num_input_tokens_seen": 20730192, + "step": 31765 + }, + { + "epoch": 18.732311320754718, + "grad_norm": 3.2977066040039062, + "learning_rate": 1.2199718925674508e-07, + "loss": 0.2117, + "num_input_tokens_seen": 20735376, + "step": 31770 + }, + { + "epoch": 18.735259433962263, + "grad_norm": 2.773209571838379, + "learning_rate": 1.2143298778021616e-07, + "loss": 0.2889, + "num_input_tokens_seen": 20739888, + "step": 31775 + }, + { + "epoch": 18.73820754716981, + "grad_norm": 4.491086959838867, + "learning_rate": 1.2087007792532967e-07, + "loss": 0.334, + "num_input_tokens_seen": 20742640, + "step": 31780 + }, + { + "epoch": 18.74115566037736, + "grad_norm": 2.7369511127471924, + "learning_rate": 1.203084598411175e-07, + "loss": 0.2551, + "num_input_tokens_seen": 20745808, + "step": 31785 + }, + { + "epoch": 18.744103773584907, + "grad_norm": 4.866786003112793, + "learning_rate": 1.1974813367627124e-07, + "loss": 0.2814, + "num_input_tokens_seen": 20748592, + "step": 31790 + }, + { + "epoch": 18.747051886792452, + "grad_norm": 2.6254589557647705, + "learning_rate": 1.1918909957913949e-07, + "loss": 0.3338, + "num_input_tokens_seen": 20752048, + "step": 31795 + }, + { + "epoch": 18.75, + "grad_norm": 3.3751771450042725, + "learning_rate": 1.1863135769772827e-07, + "loss": 0.3979, + "num_input_tokens_seen": 20758544, + "step": 31800 + }, + { + "epoch": 18.752948113207548, + "grad_norm": 2.5983822345733643, + "learning_rate": 1.1807490817970279e-07, + "loss": 0.2111, + "num_input_tokens_seen": 20762672, + "step": 31805 + }, + { + "epoch": 18.755896226415093, + "grad_norm": 4.350729465484619, + "learning_rate": 1.1751975117238578e-07, + "loss": 0.2503, + "num_input_tokens_seen": 20765776, + "step": 31810 + }, + { + "epoch": 18.75884433962264, + "grad_norm": 2.7183430194854736, + "learning_rate": 1.1696588682275633e-07, + "loss": 0.2976, + "num_input_tokens_seen": 20768528, + "step": 31815 + }, + { + "epoch": 18.76179245283019, + "grad_norm": 5.421873092651367, + "learning_rate": 1.1641331527745325e-07, + "loss": 0.3963, + "num_input_tokens_seen": 20775248, + "step": 31820 + }, + { + "epoch": 18.764740566037737, + "grad_norm": 3.816894054412842, + "learning_rate": 1.1586203668277229e-07, + "loss": 0.2465, + "num_input_tokens_seen": 20778736, + "step": 31825 + }, + { + "epoch": 18.767688679245282, + "grad_norm": 3.8145275115966797, + "learning_rate": 1.1531205118466615e-07, + "loss": 0.3572, + "num_input_tokens_seen": 20782128, + "step": 31830 + }, + { + "epoch": 18.77063679245283, + "grad_norm": 4.744506359100342, + "learning_rate": 1.1476335892874669e-07, + "loss": 0.3461, + "num_input_tokens_seen": 20785424, + "step": 31835 + }, + { + "epoch": 18.77358490566038, + "grad_norm": 4.102356910705566, + "learning_rate": 1.1421596006028157e-07, + "loss": 0.4265, + "num_input_tokens_seen": 20788976, + "step": 31840 + }, + { + "epoch": 18.776533018867923, + "grad_norm": 6.971306324005127, + "learning_rate": 1.1366985472419823e-07, + "loss": 0.2755, + "num_input_tokens_seen": 20791536, + "step": 31845 + }, + { + "epoch": 18.77948113207547, + "grad_norm": 4.049649715423584, + "learning_rate": 1.1312504306507987e-07, + "loss": 0.2372, + "num_input_tokens_seen": 20795568, + "step": 31850 + }, + { + "epoch": 18.78242924528302, + "grad_norm": 5.1576247215271, + "learning_rate": 1.1258152522716725e-07, + "loss": 0.3405, + "num_input_tokens_seen": 20798832, + "step": 31855 + }, + { + "epoch": 18.785377358490567, + "grad_norm": 4.836615562438965, + "learning_rate": 1.1203930135435914e-07, + "loss": 0.2894, + "num_input_tokens_seen": 20802352, + "step": 31860 + }, + { + "epoch": 18.788325471698112, + "grad_norm": 3.9058573246002197, + "learning_rate": 1.1149837159021238e-07, + "loss": 0.3813, + "num_input_tokens_seen": 20806160, + "step": 31865 + }, + { + "epoch": 18.79127358490566, + "grad_norm": 3.538270950317383, + "learning_rate": 1.1095873607793961e-07, + "loss": 0.2949, + "num_input_tokens_seen": 20809424, + "step": 31870 + }, + { + "epoch": 18.79422169811321, + "grad_norm": 3.828787326812744, + "learning_rate": 1.1042039496041212e-07, + "loss": 0.3164, + "num_input_tokens_seen": 20812624, + "step": 31875 + }, + { + "epoch": 18.797169811320753, + "grad_norm": 4.499998092651367, + "learning_rate": 1.0988334838015812e-07, + "loss": 0.2514, + "num_input_tokens_seen": 20815728, + "step": 31880 + }, + { + "epoch": 18.8001179245283, + "grad_norm": 2.882920026779175, + "learning_rate": 1.0934759647936333e-07, + "loss": 0.3727, + "num_input_tokens_seen": 20819664, + "step": 31885 + }, + { + "epoch": 18.80306603773585, + "grad_norm": 3.671429395675659, + "learning_rate": 1.0881313939986926e-07, + "loss": 0.2504, + "num_input_tokens_seen": 20823280, + "step": 31890 + }, + { + "epoch": 18.806014150943398, + "grad_norm": 4.965364456176758, + "learning_rate": 1.0827997728317662e-07, + "loss": 0.2794, + "num_input_tokens_seen": 20825840, + "step": 31895 + }, + { + "epoch": 18.808962264150942, + "grad_norm": 7.650140762329102, + "learning_rate": 1.0774811027044196e-07, + "loss": 0.3608, + "num_input_tokens_seen": 20828816, + "step": 31900 + }, + { + "epoch": 18.81191037735849, + "grad_norm": 4.418237209320068, + "learning_rate": 1.0721753850247984e-07, + "loss": 0.224, + "num_input_tokens_seen": 20831280, + "step": 31905 + }, + { + "epoch": 18.81485849056604, + "grad_norm": 4.494792461395264, + "learning_rate": 1.0668826211976124e-07, + "loss": 0.1971, + "num_input_tokens_seen": 20834256, + "step": 31910 + }, + { + "epoch": 18.817806603773583, + "grad_norm": 3.804464340209961, + "learning_rate": 1.0616028126241407e-07, + "loss": 0.2567, + "num_input_tokens_seen": 20837264, + "step": 31915 + }, + { + "epoch": 18.82075471698113, + "grad_norm": 4.1853461265563965, + "learning_rate": 1.0563359607022372e-07, + "loss": 0.2677, + "num_input_tokens_seen": 20840208, + "step": 31920 + }, + { + "epoch": 18.82370283018868, + "grad_norm": 6.727773666381836, + "learning_rate": 1.05108206682632e-07, + "loss": 0.2378, + "num_input_tokens_seen": 20844336, + "step": 31925 + }, + { + "epoch": 18.826650943396228, + "grad_norm": 1.8606854677200317, + "learning_rate": 1.0458411323873874e-07, + "loss": 0.2943, + "num_input_tokens_seen": 20847728, + "step": 31930 + }, + { + "epoch": 18.829599056603772, + "grad_norm": 3.531827926635742, + "learning_rate": 1.0406131587729962e-07, + "loss": 0.2771, + "num_input_tokens_seen": 20850928, + "step": 31935 + }, + { + "epoch": 18.83254716981132, + "grad_norm": 5.268754482269287, + "learning_rate": 1.035398147367278e-07, + "loss": 0.3013, + "num_input_tokens_seen": 20853968, + "step": 31940 + }, + { + "epoch": 18.83549528301887, + "grad_norm": 4.493147850036621, + "learning_rate": 1.030196099550923e-07, + "loss": 0.2883, + "num_input_tokens_seen": 20858064, + "step": 31945 + }, + { + "epoch": 18.838443396226417, + "grad_norm": 4.69831657409668, + "learning_rate": 1.0250070167011905e-07, + "loss": 0.2785, + "num_input_tokens_seen": 20860208, + "step": 31950 + }, + { + "epoch": 18.84139150943396, + "grad_norm": 2.76827073097229, + "learning_rate": 1.0198309001919315e-07, + "loss": 0.2631, + "num_input_tokens_seen": 20862864, + "step": 31955 + }, + { + "epoch": 18.84433962264151, + "grad_norm": 5.016300201416016, + "learning_rate": 1.0146677513935277e-07, + "loss": 0.3585, + "num_input_tokens_seen": 20865968, + "step": 31960 + }, + { + "epoch": 18.847287735849058, + "grad_norm": 7.613326549530029, + "learning_rate": 1.0095175716729578e-07, + "loss": 0.275, + "num_input_tokens_seen": 20869200, + "step": 31965 + }, + { + "epoch": 18.850235849056602, + "grad_norm": 4.24086856842041, + "learning_rate": 1.004380362393742e-07, + "loss": 0.3183, + "num_input_tokens_seen": 20871888, + "step": 31970 + }, + { + "epoch": 18.85318396226415, + "grad_norm": 4.29726505279541, + "learning_rate": 9.99256124915987e-08, + "loss": 0.3647, + "num_input_tokens_seen": 20874704, + "step": 31975 + }, + { + "epoch": 18.8561320754717, + "grad_norm": 3.991434335708618, + "learning_rate": 9.941448605963577e-08, + "loss": 0.4206, + "num_input_tokens_seen": 20878160, + "step": 31980 + }, + { + "epoch": 18.859080188679247, + "grad_norm": 3.2095234394073486, + "learning_rate": 9.890465707880715e-08, + "loss": 0.2586, + "num_input_tokens_seen": 20880656, + "step": 31985 + }, + { + "epoch": 18.86202830188679, + "grad_norm": 3.644953489303589, + "learning_rate": 9.839612568409374e-08, + "loss": 0.3039, + "num_input_tokens_seen": 20883760, + "step": 31990 + }, + { + "epoch": 18.86497641509434, + "grad_norm": 3.309516429901123, + "learning_rate": 9.788889201013119e-08, + "loss": 0.2815, + "num_input_tokens_seen": 20886864, + "step": 31995 + }, + { + "epoch": 18.867924528301888, + "grad_norm": 4.729465484619141, + "learning_rate": 9.738295619121097e-08, + "loss": 0.3522, + "num_input_tokens_seen": 20889552, + "step": 32000 + }, + { + "epoch": 18.870872641509433, + "grad_norm": 3.4798028469085693, + "learning_rate": 9.687831836128203e-08, + "loss": 0.2596, + "num_input_tokens_seen": 20893360, + "step": 32005 + }, + { + "epoch": 18.87382075471698, + "grad_norm": 5.643940448760986, + "learning_rate": 9.637497865395029e-08, + "loss": 0.2454, + "num_input_tokens_seen": 20896208, + "step": 32010 + }, + { + "epoch": 18.87676886792453, + "grad_norm": 6.753708362579346, + "learning_rate": 9.587293720247526e-08, + "loss": 0.3027, + "num_input_tokens_seen": 20898896, + "step": 32015 + }, + { + "epoch": 18.879716981132077, + "grad_norm": 5.177978992462158, + "learning_rate": 9.537219413977672e-08, + "loss": 0.2769, + "num_input_tokens_seen": 20902704, + "step": 32020 + }, + { + "epoch": 18.88266509433962, + "grad_norm": 3.4996254444122314, + "learning_rate": 9.487274959842696e-08, + "loss": 0.2813, + "num_input_tokens_seen": 20905712, + "step": 32025 + }, + { + "epoch": 18.88561320754717, + "grad_norm": 4.770746231079102, + "learning_rate": 9.437460371065687e-08, + "loss": 0.3012, + "num_input_tokens_seen": 20908176, + "step": 32030 + }, + { + "epoch": 18.888561320754718, + "grad_norm": 3.8718643188476562, + "learning_rate": 9.387775660835263e-08, + "loss": 0.2973, + "num_input_tokens_seen": 20911760, + "step": 32035 + }, + { + "epoch": 18.891509433962263, + "grad_norm": 7.138391494750977, + "learning_rate": 9.338220842305678e-08, + "loss": 0.2477, + "num_input_tokens_seen": 20914256, + "step": 32040 + }, + { + "epoch": 18.89445754716981, + "grad_norm": 4.266496658325195, + "learning_rate": 9.288795928596661e-08, + "loss": 0.2981, + "num_input_tokens_seen": 20917904, + "step": 32045 + }, + { + "epoch": 18.89740566037736, + "grad_norm": 6.928383827209473, + "learning_rate": 9.239500932793854e-08, + "loss": 0.2732, + "num_input_tokens_seen": 20922288, + "step": 32050 + }, + { + "epoch": 18.900353773584907, + "grad_norm": 3.0919578075408936, + "learning_rate": 9.190335867948263e-08, + "loss": 0.4326, + "num_input_tokens_seen": 20925328, + "step": 32055 + }, + { + "epoch": 18.903301886792452, + "grad_norm": 3.1570050716400146, + "learning_rate": 9.141300747076476e-08, + "loss": 0.4076, + "num_input_tokens_seen": 20928432, + "step": 32060 + }, + { + "epoch": 18.90625, + "grad_norm": 3.9997150897979736, + "learning_rate": 9.092395583160773e-08, + "loss": 0.2425, + "num_input_tokens_seen": 20932496, + "step": 32065 + }, + { + "epoch": 18.909198113207548, + "grad_norm": 5.04506254196167, + "learning_rate": 9.043620389149021e-08, + "loss": 0.1986, + "num_input_tokens_seen": 20935184, + "step": 32070 + }, + { + "epoch": 18.912146226415093, + "grad_norm": 3.264798879623413, + "learning_rate": 8.994975177954723e-08, + "loss": 0.3418, + "num_input_tokens_seen": 20938288, + "step": 32075 + }, + { + "epoch": 18.91509433962264, + "grad_norm": 3.250704050064087, + "learning_rate": 8.946459962456855e-08, + "loss": 0.3366, + "num_input_tokens_seen": 20941648, + "step": 32080 + }, + { + "epoch": 18.91804245283019, + "grad_norm": 4.3506317138671875, + "learning_rate": 8.89807475550003e-08, + "loss": 0.3754, + "num_input_tokens_seen": 20945008, + "step": 32085 + }, + { + "epoch": 18.920990566037737, + "grad_norm": 4.307066440582275, + "learning_rate": 8.849819569894447e-08, + "loss": 0.3541, + "num_input_tokens_seen": 20947792, + "step": 32090 + }, + { + "epoch": 18.923938679245282, + "grad_norm": 2.9909331798553467, + "learning_rate": 8.801694418415884e-08, + "loss": 0.2469, + "num_input_tokens_seen": 20950576, + "step": 32095 + }, + { + "epoch": 18.92688679245283, + "grad_norm": 4.034100532531738, + "learning_rate": 8.753699313805708e-08, + "loss": 0.3381, + "num_input_tokens_seen": 20953840, + "step": 32100 + }, + { + "epoch": 18.92983490566038, + "grad_norm": 4.447424411773682, + "learning_rate": 8.705834268770753e-08, + "loss": 0.2294, + "num_input_tokens_seen": 20956592, + "step": 32105 + }, + { + "epoch": 18.932783018867923, + "grad_norm": 4.128286838531494, + "learning_rate": 8.65809929598349e-08, + "loss": 0.3171, + "num_input_tokens_seen": 20959088, + "step": 32110 + }, + { + "epoch": 18.93573113207547, + "grad_norm": 2.827216148376465, + "learning_rate": 8.610494408082037e-08, + "loss": 0.2669, + "num_input_tokens_seen": 20962576, + "step": 32115 + }, + { + "epoch": 18.93867924528302, + "grad_norm": 1.4719778299331665, + "learning_rate": 8.563019617669977e-08, + "loss": 0.2132, + "num_input_tokens_seen": 20966896, + "step": 32120 + }, + { + "epoch": 18.941627358490567, + "grad_norm": 5.397637367248535, + "learning_rate": 8.51567493731642e-08, + "loss": 0.3379, + "num_input_tokens_seen": 20970608, + "step": 32125 + }, + { + "epoch": 18.944575471698112, + "grad_norm": 4.547111988067627, + "learning_rate": 8.468460379556176e-08, + "loss": 0.2447, + "num_input_tokens_seen": 20973616, + "step": 32130 + }, + { + "epoch": 18.94752358490566, + "grad_norm": 5.0956244468688965, + "learning_rate": 8.421375956889355e-08, + "loss": 0.3532, + "num_input_tokens_seen": 20976272, + "step": 32135 + }, + { + "epoch": 18.95047169811321, + "grad_norm": 3.890078067779541, + "learning_rate": 8.374421681781819e-08, + "loss": 0.2378, + "num_input_tokens_seen": 20979248, + "step": 32140 + }, + { + "epoch": 18.953419811320753, + "grad_norm": 2.777700662612915, + "learning_rate": 8.327597566665013e-08, + "loss": 0.4016, + "num_input_tokens_seen": 20982448, + "step": 32145 + }, + { + "epoch": 18.9563679245283, + "grad_norm": 3.621309995651245, + "learning_rate": 8.280903623935688e-08, + "loss": 0.4091, + "num_input_tokens_seen": 20987120, + "step": 32150 + }, + { + "epoch": 18.95931603773585, + "grad_norm": 3.2641937732696533, + "learning_rate": 8.234339865956342e-08, + "loss": 0.2368, + "num_input_tokens_seen": 20990192, + "step": 32155 + }, + { + "epoch": 18.962264150943398, + "grad_norm": 5.773024082183838, + "learning_rate": 8.187906305054838e-08, + "loss": 0.2239, + "num_input_tokens_seen": 20993264, + "step": 32160 + }, + { + "epoch": 18.965212264150942, + "grad_norm": 4.664160251617432, + "learning_rate": 8.141602953524841e-08, + "loss": 0.2199, + "num_input_tokens_seen": 20995984, + "step": 32165 + }, + { + "epoch": 18.96816037735849, + "grad_norm": 5.944611072540283, + "learning_rate": 8.095429823625212e-08, + "loss": 0.3432, + "num_input_tokens_seen": 20998672, + "step": 32170 + }, + { + "epoch": 18.97110849056604, + "grad_norm": 4.054778575897217, + "learning_rate": 8.04938692758045e-08, + "loss": 0.2767, + "num_input_tokens_seen": 21002256, + "step": 32175 + }, + { + "epoch": 18.974056603773583, + "grad_norm": 4.914705276489258, + "learning_rate": 8.003474277580803e-08, + "loss": 0.344, + "num_input_tokens_seen": 21005616, + "step": 32180 + }, + { + "epoch": 18.97700471698113, + "grad_norm": 3.643676519393921, + "learning_rate": 7.95769188578166e-08, + "loss": 0.3489, + "num_input_tokens_seen": 21008976, + "step": 32185 + }, + { + "epoch": 18.97995283018868, + "grad_norm": 4.52269172668457, + "learning_rate": 7.912039764304213e-08, + "loss": 0.4761, + "num_input_tokens_seen": 21011728, + "step": 32190 + }, + { + "epoch": 18.982900943396228, + "grad_norm": 5.998030662536621, + "learning_rate": 7.866517925235017e-08, + "loss": 0.3162, + "num_input_tokens_seen": 21015312, + "step": 32195 + }, + { + "epoch": 18.985849056603772, + "grad_norm": 5.424129009246826, + "learning_rate": 7.821126380626154e-08, + "loss": 0.2954, + "num_input_tokens_seen": 21018160, + "step": 32200 + }, + { + "epoch": 18.98879716981132, + "grad_norm": 4.437142372131348, + "learning_rate": 7.775865142495286e-08, + "loss": 0.2075, + "num_input_tokens_seen": 21020976, + "step": 32205 + }, + { + "epoch": 18.99174528301887, + "grad_norm": 4.372691631317139, + "learning_rate": 7.730734222825442e-08, + "loss": 0.3183, + "num_input_tokens_seen": 21024240, + "step": 32210 + }, + { + "epoch": 18.994693396226417, + "grad_norm": 4.8950910568237305, + "learning_rate": 7.68573363356534e-08, + "loss": 0.1867, + "num_input_tokens_seen": 21028400, + "step": 32215 + }, + { + "epoch": 18.99764150943396, + "grad_norm": 4.0425896644592285, + "learning_rate": 7.640863386629005e-08, + "loss": 0.2273, + "num_input_tokens_seen": 21032048, + "step": 32220 + }, + { + "epoch": 19.00058962264151, + "grad_norm": 3.6532318592071533, + "learning_rate": 7.59612349389599e-08, + "loss": 0.3804, + "num_input_tokens_seen": 21034792, + "step": 32225 + }, + { + "epoch": 19.003537735849058, + "grad_norm": 4.5206074714660645, + "learning_rate": 7.551513967211433e-08, + "loss": 0.356, + "num_input_tokens_seen": 21037768, + "step": 32230 + }, + { + "epoch": 19.006485849056602, + "grad_norm": 4.272274017333984, + "learning_rate": 7.507034818385883e-08, + "loss": 0.2647, + "num_input_tokens_seen": 21040552, + "step": 32235 + }, + { + "epoch": 19.00943396226415, + "grad_norm": 4.421605587005615, + "learning_rate": 7.462686059195423e-08, + "loss": 0.2797, + "num_input_tokens_seen": 21043368, + "step": 32240 + }, + { + "epoch": 19.0123820754717, + "grad_norm": 4.233822822570801, + "learning_rate": 7.418467701381548e-08, + "loss": 0.3193, + "num_input_tokens_seen": 21045928, + "step": 32245 + }, + { + "epoch": 19.015330188679247, + "grad_norm": 5.534940242767334, + "learning_rate": 7.374379756651285e-08, + "loss": 0.2731, + "num_input_tokens_seen": 21048616, + "step": 32250 + }, + { + "epoch": 19.01827830188679, + "grad_norm": 5.380414009094238, + "learning_rate": 7.330422236677015e-08, + "loss": 0.2961, + "num_input_tokens_seen": 21051304, + "step": 32255 + }, + { + "epoch": 19.02122641509434, + "grad_norm": 3.872614860534668, + "learning_rate": 7.286595153096765e-08, + "loss": 0.4085, + "num_input_tokens_seen": 21054856, + "step": 32260 + }, + { + "epoch": 19.024174528301888, + "grad_norm": 2.8566131591796875, + "learning_rate": 7.242898517513864e-08, + "loss": 0.3108, + "num_input_tokens_seen": 21057832, + "step": 32265 + }, + { + "epoch": 19.027122641509433, + "grad_norm": 5.40977668762207, + "learning_rate": 7.199332341497333e-08, + "loss": 0.2392, + "num_input_tokens_seen": 21060968, + "step": 32270 + }, + { + "epoch": 19.03007075471698, + "grad_norm": 2.8786494731903076, + "learning_rate": 7.155896636581394e-08, + "loss": 0.2418, + "num_input_tokens_seen": 21064232, + "step": 32275 + }, + { + "epoch": 19.03301886792453, + "grad_norm": 5.631832599639893, + "learning_rate": 7.112591414265901e-08, + "loss": 0.2453, + "num_input_tokens_seen": 21066792, + "step": 32280 + }, + { + "epoch": 19.035966981132077, + "grad_norm": 3.814474582672119, + "learning_rate": 7.069416686016018e-08, + "loss": 0.4086, + "num_input_tokens_seen": 21069288, + "step": 32285 + }, + { + "epoch": 19.03891509433962, + "grad_norm": 2.784569263458252, + "learning_rate": 7.026372463262488e-08, + "loss": 0.2938, + "num_input_tokens_seen": 21073544, + "step": 32290 + }, + { + "epoch": 19.04186320754717, + "grad_norm": 3.023380756378174, + "learning_rate": 6.983458757401418e-08, + "loss": 0.307, + "num_input_tokens_seen": 21077096, + "step": 32295 + }, + { + "epoch": 19.044811320754718, + "grad_norm": 2.9062161445617676, + "learning_rate": 6.940675579794443e-08, + "loss": 0.3095, + "num_input_tokens_seen": 21080104, + "step": 32300 + }, + { + "epoch": 19.047759433962263, + "grad_norm": 6.451761722564697, + "learning_rate": 6.898022941768612e-08, + "loss": 0.2692, + "num_input_tokens_seen": 21083304, + "step": 32305 + }, + { + "epoch": 19.05070754716981, + "grad_norm": 3.206260919570923, + "learning_rate": 6.855500854616337e-08, + "loss": 0.3282, + "num_input_tokens_seen": 21086920, + "step": 32310 + }, + { + "epoch": 19.05365566037736, + "grad_norm": 2.1237645149230957, + "learning_rate": 6.813109329595557e-08, + "loss": 0.3242, + "num_input_tokens_seen": 21090952, + "step": 32315 + }, + { + "epoch": 19.056603773584907, + "grad_norm": 6.356142044067383, + "learning_rate": 6.770848377929573e-08, + "loss": 0.3587, + "num_input_tokens_seen": 21095336, + "step": 32320 + }, + { + "epoch": 19.059551886792452, + "grad_norm": 4.190639972686768, + "learning_rate": 6.728718010807156e-08, + "loss": 0.3279, + "num_input_tokens_seen": 21098120, + "step": 32325 + }, + { + "epoch": 19.0625, + "grad_norm": 3.594177484512329, + "learning_rate": 6.68671823938255e-08, + "loss": 0.2359, + "num_input_tokens_seen": 21101096, + "step": 32330 + }, + { + "epoch": 19.065448113207548, + "grad_norm": 5.312868118286133, + "learning_rate": 6.644849074775361e-08, + "loss": 0.3963, + "num_input_tokens_seen": 21104072, + "step": 32335 + }, + { + "epoch": 19.068396226415093, + "grad_norm": 2.4226672649383545, + "learning_rate": 6.603110528070667e-08, + "loss": 0.277, + "num_input_tokens_seen": 21107176, + "step": 32340 + }, + { + "epoch": 19.07134433962264, + "grad_norm": 5.041682720184326, + "learning_rate": 6.561502610318849e-08, + "loss": 0.2613, + "num_input_tokens_seen": 21110056, + "step": 32345 + }, + { + "epoch": 19.07429245283019, + "grad_norm": 3.1307733058929443, + "learning_rate": 6.520025332535762e-08, + "loss": 0.2424, + "num_input_tokens_seen": 21113416, + "step": 32350 + }, + { + "epoch": 19.077240566037737, + "grad_norm": 4.361947536468506, + "learning_rate": 6.47867870570279e-08, + "loss": 0.2071, + "num_input_tokens_seen": 21116648, + "step": 32355 + }, + { + "epoch": 19.080188679245282, + "grad_norm": 2.14378023147583, + "learning_rate": 6.437462740766564e-08, + "loss": 0.2245, + "num_input_tokens_seen": 21120488, + "step": 32360 + }, + { + "epoch": 19.08313679245283, + "grad_norm": 2.909815788269043, + "learning_rate": 6.396377448639246e-08, + "loss": 0.2974, + "num_input_tokens_seen": 21123688, + "step": 32365 + }, + { + "epoch": 19.08608490566038, + "grad_norm": 3.7488393783569336, + "learning_rate": 6.3554228401983e-08, + "loss": 0.2927, + "num_input_tokens_seen": 21126760, + "step": 32370 + }, + { + "epoch": 19.089033018867923, + "grad_norm": 15.224458694458008, + "learning_rate": 6.314598926286663e-08, + "loss": 0.2894, + "num_input_tokens_seen": 21129224, + "step": 32375 + }, + { + "epoch": 19.09198113207547, + "grad_norm": 3.416590929031372, + "learning_rate": 6.273905717712637e-08, + "loss": 0.2526, + "num_input_tokens_seen": 21132072, + "step": 32380 + }, + { + "epoch": 19.09492924528302, + "grad_norm": 2.687267303466797, + "learning_rate": 6.233343225249933e-08, + "loss": 0.3588, + "num_input_tokens_seen": 21135592, + "step": 32385 + }, + { + "epoch": 19.097877358490567, + "grad_norm": 3.487229347229004, + "learning_rate": 6.192911459637519e-08, + "loss": 0.4374, + "num_input_tokens_seen": 21138824, + "step": 32390 + }, + { + "epoch": 19.100825471698112, + "grad_norm": 3.9031264781951904, + "learning_rate": 6.152610431580052e-08, + "loss": 0.2376, + "num_input_tokens_seen": 21141896, + "step": 32395 + }, + { + "epoch": 19.10377358490566, + "grad_norm": 4.132111072540283, + "learning_rate": 6.112440151747389e-08, + "loss": 0.2266, + "num_input_tokens_seen": 21145448, + "step": 32400 + }, + { + "epoch": 19.10672169811321, + "grad_norm": 3.3005239963531494, + "learning_rate": 6.072400630774689e-08, + "loss": 0.3065, + "num_input_tokens_seen": 21148616, + "step": 32405 + }, + { + "epoch": 19.109669811320753, + "grad_norm": 3.4781055450439453, + "learning_rate": 6.032491879262637e-08, + "loss": 0.3904, + "num_input_tokens_seen": 21151816, + "step": 32410 + }, + { + "epoch": 19.1126179245283, + "grad_norm": 4.325366497039795, + "learning_rate": 5.99271390777717e-08, + "loss": 0.2691, + "num_input_tokens_seen": 21155656, + "step": 32415 + }, + { + "epoch": 19.11556603773585, + "grad_norm": 3.52805495262146, + "learning_rate": 5.953066726849865e-08, + "loss": 0.2806, + "num_input_tokens_seen": 21159144, + "step": 32420 + }, + { + "epoch": 19.118514150943398, + "grad_norm": 5.7765045166015625, + "learning_rate": 5.913550346977326e-08, + "loss": 0.2503, + "num_input_tokens_seen": 21164616, + "step": 32425 + }, + { + "epoch": 19.121462264150942, + "grad_norm": 2.5773887634277344, + "learning_rate": 5.874164778621683e-08, + "loss": 0.4119, + "num_input_tokens_seen": 21167176, + "step": 32430 + }, + { + "epoch": 19.12441037735849, + "grad_norm": 5.0169267654418945, + "learning_rate": 5.834910032210539e-08, + "loss": 0.3008, + "num_input_tokens_seen": 21169736, + "step": 32435 + }, + { + "epoch": 19.12735849056604, + "grad_norm": 6.32297945022583, + "learning_rate": 5.795786118136693e-08, + "loss": 0.361, + "num_input_tokens_seen": 21172520, + "step": 32440 + }, + { + "epoch": 19.130306603773583, + "grad_norm": 5.741361141204834, + "learning_rate": 5.756793046758302e-08, + "loss": 0.3234, + "num_input_tokens_seen": 21175048, + "step": 32445 + }, + { + "epoch": 19.13325471698113, + "grad_norm": 4.130574703216553, + "learning_rate": 5.7179308283990544e-08, + "loss": 0.3188, + "num_input_tokens_seen": 21177672, + "step": 32450 + }, + { + "epoch": 19.13620283018868, + "grad_norm": 3.929359197616577, + "learning_rate": 5.679199473347885e-08, + "loss": 0.3806, + "num_input_tokens_seen": 21180712, + "step": 32455 + }, + { + "epoch": 19.139150943396228, + "grad_norm": 4.547061443328857, + "learning_rate": 5.6405989918590366e-08, + "loss": 0.4888, + "num_input_tokens_seen": 21184872, + "step": 32460 + }, + { + "epoch": 19.142099056603772, + "grad_norm": 9.877241134643555, + "learning_rate": 5.6021293941522225e-08, + "loss": 0.3319, + "num_input_tokens_seen": 21187784, + "step": 32465 + }, + { + "epoch": 19.14504716981132, + "grad_norm": 4.080076694488525, + "learning_rate": 5.563790690412352e-08, + "loss": 0.2975, + "num_input_tokens_seen": 21191656, + "step": 32470 + }, + { + "epoch": 19.14799528301887, + "grad_norm": 2.6291699409484863, + "learning_rate": 5.525582890789805e-08, + "loss": 0.2556, + "num_input_tokens_seen": 21194792, + "step": 32475 + }, + { + "epoch": 19.150943396226417, + "grad_norm": 4.920339107513428, + "learning_rate": 5.4875060054002115e-08, + "loss": 0.3273, + "num_input_tokens_seen": 21197640, + "step": 32480 + }, + { + "epoch": 19.15389150943396, + "grad_norm": 3.7789671421051025, + "learning_rate": 5.4495600443246755e-08, + "loss": 0.3259, + "num_input_tokens_seen": 21202056, + "step": 32485 + }, + { + "epoch": 19.15683962264151, + "grad_norm": 5.435826301574707, + "learning_rate": 5.411745017609493e-08, + "loss": 0.4404, + "num_input_tokens_seen": 21205192, + "step": 32490 + }, + { + "epoch": 19.159787735849058, + "grad_norm": 2.7826404571533203, + "learning_rate": 5.374060935266434e-08, + "loss": 0.1705, + "num_input_tokens_seen": 21207880, + "step": 32495 + }, + { + "epoch": 19.162735849056602, + "grad_norm": 4.484762668609619, + "learning_rate": 5.3365078072724065e-08, + "loss": 0.2481, + "num_input_tokens_seen": 21211112, + "step": 32500 + }, + { + "epoch": 19.16568396226415, + "grad_norm": 3.878443479537964, + "learning_rate": 5.299085643569846e-08, + "loss": 0.306, + "num_input_tokens_seen": 21214184, + "step": 32505 + }, + { + "epoch": 19.1686320754717, + "grad_norm": 3.3356823921203613, + "learning_rate": 5.261794454066327e-08, + "loss": 0.4544, + "num_input_tokens_seen": 21217128, + "step": 32510 + }, + { + "epoch": 19.171580188679247, + "grad_norm": 3.0034141540527344, + "learning_rate": 5.224634248635008e-08, + "loss": 0.2095, + "num_input_tokens_seen": 21220616, + "step": 32515 + }, + { + "epoch": 19.17452830188679, + "grad_norm": 5.759814739227295, + "learning_rate": 5.187605037114129e-08, + "loss": 0.2812, + "num_input_tokens_seen": 21224168, + "step": 32520 + }, + { + "epoch": 19.17747641509434, + "grad_norm": 3.05263352394104, + "learning_rate": 5.15070682930735e-08, + "loss": 0.3054, + "num_input_tokens_seen": 21227944, + "step": 32525 + }, + { + "epoch": 19.180424528301888, + "grad_norm": 3.8854377269744873, + "learning_rate": 5.113939634983578e-08, + "loss": 0.3258, + "num_input_tokens_seen": 21230184, + "step": 32530 + }, + { + "epoch": 19.183372641509433, + "grad_norm": 3.7161104679107666, + "learning_rate": 5.077303463877192e-08, + "loss": 0.266, + "num_input_tokens_seen": 21233768, + "step": 32535 + }, + { + "epoch": 19.18632075471698, + "grad_norm": 4.1021833419799805, + "learning_rate": 5.040798325687601e-08, + "loss": 0.4034, + "num_input_tokens_seen": 21237224, + "step": 32540 + }, + { + "epoch": 19.18926886792453, + "grad_norm": 2.2263023853302, + "learning_rate": 5.004424230079852e-08, + "loss": 0.2871, + "num_input_tokens_seen": 21240392, + "step": 32545 + }, + { + "epoch": 19.192216981132077, + "grad_norm": 4.2272467613220215, + "learning_rate": 4.968181186684129e-08, + "loss": 0.2167, + "num_input_tokens_seen": 21243880, + "step": 32550 + }, + { + "epoch": 19.19516509433962, + "grad_norm": 3.266697406768799, + "learning_rate": 4.932069205095924e-08, + "loss": 0.3156, + "num_input_tokens_seen": 21247592, + "step": 32555 + }, + { + "epoch": 19.19811320754717, + "grad_norm": 5.634979724884033, + "learning_rate": 4.896088294875978e-08, + "loss": 0.3853, + "num_input_tokens_seen": 21250056, + "step": 32560 + }, + { + "epoch": 19.201061320754718, + "grad_norm": 5.005356311798096, + "learning_rate": 4.8602384655505044e-08, + "loss": 0.3021, + "num_input_tokens_seen": 21252872, + "step": 32565 + }, + { + "epoch": 19.204009433962263, + "grad_norm": 4.0302324295043945, + "learning_rate": 4.824519726610744e-08, + "loss": 0.3121, + "num_input_tokens_seen": 21256904, + "step": 32570 + }, + { + "epoch": 19.20695754716981, + "grad_norm": 4.050544738769531, + "learning_rate": 4.7889320875135206e-08, + "loss": 0.2636, + "num_input_tokens_seen": 21259848, + "step": 32575 + }, + { + "epoch": 19.20990566037736, + "grad_norm": 15.176353454589844, + "learning_rate": 4.753475557680742e-08, + "loss": 0.3051, + "num_input_tokens_seen": 21262760, + "step": 32580 + }, + { + "epoch": 19.212853773584907, + "grad_norm": 5.9407057762146, + "learning_rate": 4.718150146499734e-08, + "loss": 0.2632, + "num_input_tokens_seen": 21265928, + "step": 32585 + }, + { + "epoch": 19.215801886792452, + "grad_norm": 5.718442440032959, + "learning_rate": 4.682955863323013e-08, + "loss": 0.2068, + "num_input_tokens_seen": 21268936, + "step": 32590 + }, + { + "epoch": 19.21875, + "grad_norm": 3.8035826683044434, + "learning_rate": 4.6478927174684606e-08, + "loss": 0.2969, + "num_input_tokens_seen": 21271912, + "step": 32595 + }, + { + "epoch": 19.221698113207548, + "grad_norm": 9.988144874572754, + "learning_rate": 4.612960718219095e-08, + "loss": 0.273, + "num_input_tokens_seen": 21274536, + "step": 32600 + }, + { + "epoch": 19.224646226415093, + "grad_norm": 3.817896604537964, + "learning_rate": 4.578159874823407e-08, + "loss": 0.2829, + "num_input_tokens_seen": 21278792, + "step": 32605 + }, + { + "epoch": 19.22759433962264, + "grad_norm": 2.4605555534362793, + "learning_rate": 4.5434901964950264e-08, + "loss": 0.2358, + "num_input_tokens_seen": 21281544, + "step": 32610 + }, + { + "epoch": 19.23054245283019, + "grad_norm": 3.4008195400238037, + "learning_rate": 4.508951692412944e-08, + "loss": 0.4405, + "num_input_tokens_seen": 21284712, + "step": 32615 + }, + { + "epoch": 19.233490566037737, + "grad_norm": 3.319037675857544, + "learning_rate": 4.4745443717213455e-08, + "loss": 0.2929, + "num_input_tokens_seen": 21288232, + "step": 32620 + }, + { + "epoch": 19.236438679245282, + "grad_norm": 3.9437336921691895, + "learning_rate": 4.4402682435296666e-08, + "loss": 0.2574, + "num_input_tokens_seen": 21292040, + "step": 32625 + }, + { + "epoch": 19.23938679245283, + "grad_norm": 3.434069871902466, + "learning_rate": 4.406123316912758e-08, + "loss": 0.3215, + "num_input_tokens_seen": 21295528, + "step": 32630 + }, + { + "epoch": 19.24233490566038, + "grad_norm": 2.705620050430298, + "learning_rate": 4.372109600910612e-08, + "loss": 0.1895, + "num_input_tokens_seen": 21298216, + "step": 32635 + }, + { + "epoch": 19.245283018867923, + "grad_norm": 2.950105667114258, + "learning_rate": 4.338227104528414e-08, + "loss": 0.2654, + "num_input_tokens_seen": 21301352, + "step": 32640 + }, + { + "epoch": 19.24823113207547, + "grad_norm": 4.302958965301514, + "learning_rate": 4.304475836736821e-08, + "loss": 0.4293, + "num_input_tokens_seen": 21303720, + "step": 32645 + }, + { + "epoch": 19.25117924528302, + "grad_norm": 7.830981254577637, + "learning_rate": 4.27085580647163e-08, + "loss": 0.405, + "num_input_tokens_seen": 21306760, + "step": 32650 + }, + { + "epoch": 19.254127358490567, + "grad_norm": 3.535897970199585, + "learning_rate": 4.237367022633776e-08, + "loss": 0.342, + "num_input_tokens_seen": 21311752, + "step": 32655 + }, + { + "epoch": 19.257075471698112, + "grad_norm": 4.188340187072754, + "learning_rate": 4.204009494089612e-08, + "loss": 0.2638, + "num_input_tokens_seen": 21314696, + "step": 32660 + }, + { + "epoch": 19.26002358490566, + "grad_norm": 5.6307454109191895, + "learning_rate": 4.170783229670739e-08, + "loss": 0.2289, + "num_input_tokens_seen": 21317960, + "step": 32665 + }, + { + "epoch": 19.26297169811321, + "grad_norm": 4.13720703125, + "learning_rate": 4.137688238173898e-08, + "loss": 0.2801, + "num_input_tokens_seen": 21322024, + "step": 32670 + }, + { + "epoch": 19.265919811320753, + "grad_norm": 2.883984327316284, + "learning_rate": 4.104724528361137e-08, + "loss": 0.2995, + "num_input_tokens_seen": 21325384, + "step": 32675 + }, + { + "epoch": 19.2688679245283, + "grad_norm": 2.080756664276123, + "learning_rate": 4.071892108959752e-08, + "loss": 0.3396, + "num_input_tokens_seen": 21328680, + "step": 32680 + }, + { + "epoch": 19.27181603773585, + "grad_norm": 4.280841827392578, + "learning_rate": 4.039190988662234e-08, + "loss": 0.2063, + "num_input_tokens_seen": 21331432, + "step": 32685 + }, + { + "epoch": 19.274764150943398, + "grad_norm": 3.937467575073242, + "learning_rate": 4.006621176126435e-08, + "loss": 0.2236, + "num_input_tokens_seen": 21334728, + "step": 32690 + }, + { + "epoch": 19.277712264150942, + "grad_norm": 3.7968103885650635, + "learning_rate": 3.974182679975236e-08, + "loss": 0.3599, + "num_input_tokens_seen": 21338376, + "step": 32695 + }, + { + "epoch": 19.28066037735849, + "grad_norm": 3.42622971534729, + "learning_rate": 3.941875508796933e-08, + "loss": 0.2961, + "num_input_tokens_seen": 21341032, + "step": 32700 + }, + { + "epoch": 19.28360849056604, + "grad_norm": 4.56218957901001, + "learning_rate": 3.909699671145017e-08, + "loss": 0.281, + "num_input_tokens_seen": 21344264, + "step": 32705 + }, + { + "epoch": 19.286556603773583, + "grad_norm": 3.310807704925537, + "learning_rate": 3.87765517553812e-08, + "loss": 0.3551, + "num_input_tokens_seen": 21346920, + "step": 32710 + }, + { + "epoch": 19.28950471698113, + "grad_norm": 4.824003219604492, + "learning_rate": 3.8457420304601756e-08, + "loss": 0.3673, + "num_input_tokens_seen": 21349896, + "step": 32715 + }, + { + "epoch": 19.29245283018868, + "grad_norm": 4.598348140716553, + "learning_rate": 3.813960244360371e-08, + "loss": 0.2812, + "num_input_tokens_seen": 21352808, + "step": 32720 + }, + { + "epoch": 19.295400943396228, + "grad_norm": 4.563314914703369, + "learning_rate": 3.7823098256529744e-08, + "loss": 0.3668, + "num_input_tokens_seen": 21356136, + "step": 32725 + }, + { + "epoch": 19.298349056603772, + "grad_norm": 3.006699800491333, + "learning_rate": 3.750790782717673e-08, + "loss": 0.3403, + "num_input_tokens_seen": 21360808, + "step": 32730 + }, + { + "epoch": 19.30129716981132, + "grad_norm": 3.948866128921509, + "learning_rate": 3.719403123899179e-08, + "loss": 0.2883, + "num_input_tokens_seen": 21363464, + "step": 32735 + }, + { + "epoch": 19.30424528301887, + "grad_norm": 2.479295492172241, + "learning_rate": 3.688146857507624e-08, + "loss": 0.2235, + "num_input_tokens_seen": 21368104, + "step": 32740 + }, + { + "epoch": 19.307193396226417, + "grad_norm": 5.731419086456299, + "learning_rate": 3.657021991818166e-08, + "loss": 0.4012, + "num_input_tokens_seen": 21370824, + "step": 32745 + }, + { + "epoch": 19.31014150943396, + "grad_norm": 6.35030460357666, + "learning_rate": 3.626028535071213e-08, + "loss": 0.2299, + "num_input_tokens_seen": 21373512, + "step": 32750 + }, + { + "epoch": 19.31308962264151, + "grad_norm": 2.27323055267334, + "learning_rate": 3.59516649547248e-08, + "loss": 0.2486, + "num_input_tokens_seen": 21376552, + "step": 32755 + }, + { + "epoch": 19.316037735849058, + "grad_norm": 5.354336738586426, + "learning_rate": 3.564435881192818e-08, + "loss": 0.2959, + "num_input_tokens_seen": 21379368, + "step": 32760 + }, + { + "epoch": 19.318985849056602, + "grad_norm": 3.7711942195892334, + "learning_rate": 3.5338367003682763e-08, + "loss": 0.2732, + "num_input_tokens_seen": 21382888, + "step": 32765 + }, + { + "epoch": 19.32193396226415, + "grad_norm": 3.641944408416748, + "learning_rate": 3.5033689611000954e-08, + "loss": 0.2127, + "num_input_tokens_seen": 21385768, + "step": 32770 + }, + { + "epoch": 19.3248820754717, + "grad_norm": 5.2400736808776855, + "learning_rate": 3.473032671454768e-08, + "loss": 0.4212, + "num_input_tokens_seen": 21388904, + "step": 32775 + }, + { + "epoch": 19.327830188679247, + "grad_norm": 4.567250728607178, + "learning_rate": 3.44282783946398e-08, + "loss": 0.2825, + "num_input_tokens_seen": 21391624, + "step": 32780 + }, + { + "epoch": 19.33077830188679, + "grad_norm": 1.8661001920700073, + "learning_rate": 3.4127544731245575e-08, + "loss": 0.3603, + "num_input_tokens_seen": 21395912, + "step": 32785 + }, + { + "epoch": 19.33372641509434, + "grad_norm": 3.29093861579895, + "learning_rate": 3.38281258039852e-08, + "loss": 0.1849, + "num_input_tokens_seen": 21399496, + "step": 32790 + }, + { + "epoch": 19.336674528301888, + "grad_norm": 6.583939075469971, + "learning_rate": 3.353002169213193e-08, + "loss": 0.303, + "num_input_tokens_seen": 21402088, + "step": 32795 + }, + { + "epoch": 19.339622641509433, + "grad_norm": 4.505652904510498, + "learning_rate": 3.32332324746093e-08, + "loss": 0.4208, + "num_input_tokens_seen": 21404936, + "step": 32800 + }, + { + "epoch": 19.34257075471698, + "grad_norm": 3.436218500137329, + "learning_rate": 3.2937758229994455e-08, + "loss": 0.3071, + "num_input_tokens_seen": 21408680, + "step": 32805 + }, + { + "epoch": 19.34551886792453, + "grad_norm": 4.795374870300293, + "learning_rate": 3.2643599036514815e-08, + "loss": 0.2711, + "num_input_tokens_seen": 21412072, + "step": 32810 + }, + { + "epoch": 19.348466981132077, + "grad_norm": 4.39146614074707, + "learning_rate": 3.2350754972050316e-08, + "loss": 0.2704, + "num_input_tokens_seen": 21415016, + "step": 32815 + }, + { + "epoch": 19.35141509433962, + "grad_norm": 2.3008618354797363, + "learning_rate": 3.2059226114132815e-08, + "loss": 0.2713, + "num_input_tokens_seen": 21417928, + "step": 32820 + }, + { + "epoch": 19.35436320754717, + "grad_norm": 2.805619955062866, + "learning_rate": 3.1769012539945575e-08, + "loss": 0.2296, + "num_input_tokens_seen": 21420136, + "step": 32825 + }, + { + "epoch": 19.357311320754718, + "grad_norm": 6.54548978805542, + "learning_rate": 3.1480114326324364e-08, + "loss": 0.3328, + "num_input_tokens_seen": 21422408, + "step": 32830 + }, + { + "epoch": 19.360259433962263, + "grad_norm": 6.361226558685303, + "learning_rate": 3.1192531549756325e-08, + "loss": 0.2794, + "num_input_tokens_seen": 21425352, + "step": 32835 + }, + { + "epoch": 19.36320754716981, + "grad_norm": 4.07490873336792, + "learning_rate": 3.090626428638e-08, + "loss": 0.4428, + "num_input_tokens_seen": 21428392, + "step": 32840 + }, + { + "epoch": 19.36615566037736, + "grad_norm": 3.5838065147399902, + "learning_rate": 3.062131261198531e-08, + "loss": 0.33, + "num_input_tokens_seen": 21431272, + "step": 32845 + }, + { + "epoch": 19.369103773584907, + "grad_norm": 4.106979846954346, + "learning_rate": 3.033767660201525e-08, + "loss": 0.3193, + "num_input_tokens_seen": 21434248, + "step": 32850 + }, + { + "epoch": 19.372051886792452, + "grad_norm": 2.625016450881958, + "learning_rate": 3.005535633156309e-08, + "loss": 0.1988, + "num_input_tokens_seen": 21437320, + "step": 32855 + }, + { + "epoch": 19.375, + "grad_norm": 2.410367488861084, + "learning_rate": 2.977435187537514e-08, + "loss": 0.2124, + "num_input_tokens_seen": 21440552, + "step": 32860 + }, + { + "epoch": 19.377948113207548, + "grad_norm": 3.2358736991882324, + "learning_rate": 2.949466330784745e-08, + "loss": 0.2473, + "num_input_tokens_seen": 21444488, + "step": 32865 + }, + { + "epoch": 19.380896226415093, + "grad_norm": 6.545119285583496, + "learning_rate": 2.921629070302967e-08, + "loss": 0.3404, + "num_input_tokens_seen": 21447304, + "step": 32870 + }, + { + "epoch": 19.38384433962264, + "grad_norm": 5.443304061889648, + "learning_rate": 2.893923413462174e-08, + "loss": 0.3005, + "num_input_tokens_seen": 21450152, + "step": 32875 + }, + { + "epoch": 19.38679245283019, + "grad_norm": 4.565801620483398, + "learning_rate": 2.866349367597554e-08, + "loss": 0.3742, + "num_input_tokens_seen": 21452744, + "step": 32880 + }, + { + "epoch": 19.389740566037737, + "grad_norm": 4.493298053741455, + "learning_rate": 2.8389069400094893e-08, + "loss": 0.3459, + "num_input_tokens_seen": 21457224, + "step": 32885 + }, + { + "epoch": 19.392688679245282, + "grad_norm": 6.825876712799072, + "learning_rate": 2.811596137963446e-08, + "loss": 0.2872, + "num_input_tokens_seen": 21460616, + "step": 32890 + }, + { + "epoch": 19.39563679245283, + "grad_norm": 3.8724637031555176, + "learning_rate": 2.7844169686900844e-08, + "loss": 0.2755, + "num_input_tokens_seen": 21464488, + "step": 32895 + }, + { + "epoch": 19.39858490566038, + "grad_norm": 2.525320291519165, + "learning_rate": 2.7573694393852047e-08, + "loss": 0.2681, + "num_input_tokens_seen": 21467720, + "step": 32900 + }, + { + "epoch": 19.401533018867923, + "grad_norm": 5.503297805786133, + "learning_rate": 2.7304535572098e-08, + "loss": 0.3072, + "num_input_tokens_seen": 21470600, + "step": 32905 + }, + { + "epoch": 19.40448113207547, + "grad_norm": 2.876127243041992, + "learning_rate": 2.703669329289893e-08, + "loss": 0.253, + "num_input_tokens_seen": 21473992, + "step": 32910 + }, + { + "epoch": 19.40742924528302, + "grad_norm": 5.287059783935547, + "learning_rate": 2.6770167627167554e-08, + "loss": 0.4588, + "num_input_tokens_seen": 21477672, + "step": 32915 + }, + { + "epoch": 19.410377358490567, + "grad_norm": 4.27646017074585, + "learning_rate": 2.6504958645467426e-08, + "loss": 0.3745, + "num_input_tokens_seen": 21480648, + "step": 32920 + }, + { + "epoch": 19.413325471698112, + "grad_norm": 4.470993995666504, + "learning_rate": 2.6241066418014605e-08, + "loss": 0.2767, + "num_input_tokens_seen": 21483208, + "step": 32925 + }, + { + "epoch": 19.41627358490566, + "grad_norm": 2.758554697036743, + "learning_rate": 2.5978491014674866e-08, + "loss": 0.1813, + "num_input_tokens_seen": 21486504, + "step": 32930 + }, + { + "epoch": 19.41922169811321, + "grad_norm": 3.8388237953186035, + "learning_rate": 2.571723250496705e-08, + "loss": 0.2385, + "num_input_tokens_seen": 21490024, + "step": 32935 + }, + { + "epoch": 19.422169811320753, + "grad_norm": 3.5145936012268066, + "learning_rate": 2.5457290958059155e-08, + "loss": 0.2529, + "num_input_tokens_seen": 21493704, + "step": 32940 + }, + { + "epoch": 19.4251179245283, + "grad_norm": 4.431590557098389, + "learning_rate": 2.51986664427728e-08, + "loss": 0.2939, + "num_input_tokens_seen": 21497448, + "step": 32945 + }, + { + "epoch": 19.42806603773585, + "grad_norm": 9.322731018066406, + "learning_rate": 2.4941359027579883e-08, + "loss": 0.474, + "num_input_tokens_seen": 21500520, + "step": 32950 + }, + { + "epoch": 19.431014150943398, + "grad_norm": 11.131851196289062, + "learning_rate": 2.468536878060368e-08, + "loss": 0.2568, + "num_input_tokens_seen": 21503784, + "step": 32955 + }, + { + "epoch": 19.433962264150942, + "grad_norm": 6.309096813201904, + "learning_rate": 2.443069576961832e-08, + "loss": 0.3477, + "num_input_tokens_seen": 21507048, + "step": 32960 + }, + { + "epoch": 19.43691037735849, + "grad_norm": 3.2915539741516113, + "learning_rate": 2.4177340062049304e-08, + "loss": 0.265, + "num_input_tokens_seen": 21509512, + "step": 32965 + }, + { + "epoch": 19.43985849056604, + "grad_norm": 4.698244094848633, + "learning_rate": 2.3925301724974647e-08, + "loss": 0.3565, + "num_input_tokens_seen": 21512232, + "step": 32970 + }, + { + "epoch": 19.442806603773583, + "grad_norm": 6.325228214263916, + "learning_rate": 2.367458082512153e-08, + "loss": 0.2231, + "num_input_tokens_seen": 21515464, + "step": 32975 + }, + { + "epoch": 19.44575471698113, + "grad_norm": 4.202943801879883, + "learning_rate": 2.3425177428870737e-08, + "loss": 0.2759, + "num_input_tokens_seen": 21518312, + "step": 32980 + }, + { + "epoch": 19.44870283018868, + "grad_norm": 5.13239860534668, + "learning_rate": 2.3177091602251677e-08, + "loss": 0.2931, + "num_input_tokens_seen": 21521576, + "step": 32985 + }, + { + "epoch": 19.451650943396228, + "grad_norm": 4.273693084716797, + "learning_rate": 2.2930323410946254e-08, + "loss": 0.2695, + "num_input_tokens_seen": 21525640, + "step": 32990 + }, + { + "epoch": 19.454599056603772, + "grad_norm": 3.2863235473632812, + "learning_rate": 2.2684872920287758e-08, + "loss": 0.3224, + "num_input_tokens_seen": 21529608, + "step": 32995 + }, + { + "epoch": 19.45754716981132, + "grad_norm": 4.616123676300049, + "learning_rate": 2.2440740195260323e-08, + "loss": 0.3279, + "num_input_tokens_seen": 21532840, + "step": 33000 + }, + { + "epoch": 19.46049528301887, + "grad_norm": 5.251607894897461, + "learning_rate": 2.219792530049891e-08, + "loss": 0.4382, + "num_input_tokens_seen": 21538696, + "step": 33005 + }, + { + "epoch": 19.463443396226417, + "grad_norm": 3.545989513397217, + "learning_rate": 2.1956428300290434e-08, + "loss": 0.2382, + "num_input_tokens_seen": 21542216, + "step": 33010 + }, + { + "epoch": 19.46639150943396, + "grad_norm": 4.449833869934082, + "learning_rate": 2.1716249258570966e-08, + "loss": 0.3048, + "num_input_tokens_seen": 21544808, + "step": 33015 + }, + { + "epoch": 19.46933962264151, + "grad_norm": 3.825828790664673, + "learning_rate": 2.1477388238930196e-08, + "loss": 0.2526, + "num_input_tokens_seen": 21548552, + "step": 33020 + }, + { + "epoch": 19.472287735849058, + "grad_norm": 6.2495317459106445, + "learning_rate": 2.1239845304606988e-08, + "loss": 0.3218, + "num_input_tokens_seen": 21551272, + "step": 33025 + }, + { + "epoch": 19.475235849056602, + "grad_norm": 4.540937900543213, + "learning_rate": 2.100362051849214e-08, + "loss": 0.3461, + "num_input_tokens_seen": 21554952, + "step": 33030 + }, + { + "epoch": 19.47818396226415, + "grad_norm": 3.5415186882019043, + "learning_rate": 2.076871394312674e-08, + "loss": 0.2865, + "num_input_tokens_seen": 21558952, + "step": 33035 + }, + { + "epoch": 19.4811320754717, + "grad_norm": 3.3420169353485107, + "learning_rate": 2.0535125640703813e-08, + "loss": 0.3412, + "num_input_tokens_seen": 21561800, + "step": 33040 + }, + { + "epoch": 19.484080188679247, + "grad_norm": 3.171975612640381, + "learning_rate": 2.0302855673066667e-08, + "loss": 0.2964, + "num_input_tokens_seen": 21564616, + "step": 33045 + }, + { + "epoch": 19.48702830188679, + "grad_norm": 4.28542423248291, + "learning_rate": 2.0071904101710004e-08, + "loss": 0.271, + "num_input_tokens_seen": 21567240, + "step": 33050 + }, + { + "epoch": 19.48997641509434, + "grad_norm": 6.73660945892334, + "learning_rate": 1.98422709877788e-08, + "loss": 0.4516, + "num_input_tokens_seen": 21569896, + "step": 33055 + }, + { + "epoch": 19.492924528301888, + "grad_norm": 11.652873039245605, + "learning_rate": 1.961395639206942e-08, + "loss": 0.3562, + "num_input_tokens_seen": 21573064, + "step": 33060 + }, + { + "epoch": 19.495872641509433, + "grad_norm": 3.7151145935058594, + "learning_rate": 1.9386960375029628e-08, + "loss": 0.201, + "num_input_tokens_seen": 21576232, + "step": 33065 + }, + { + "epoch": 19.49882075471698, + "grad_norm": 3.0851712226867676, + "learning_rate": 1.9161282996757458e-08, + "loss": 0.3357, + "num_input_tokens_seen": 21579208, + "step": 33070 + }, + { + "epoch": 19.50176886792453, + "grad_norm": 2.995922803878784, + "learning_rate": 1.8936924317001225e-08, + "loss": 0.3204, + "num_input_tokens_seen": 21582504, + "step": 33075 + }, + { + "epoch": 19.504716981132077, + "grad_norm": 1.9775701761245728, + "learning_rate": 1.8713884395162308e-08, + "loss": 0.179, + "num_input_tokens_seen": 21585416, + "step": 33080 + }, + { + "epoch": 19.50766509433962, + "grad_norm": 5.540274620056152, + "learning_rate": 1.8492163290290132e-08, + "loss": 0.4274, + "num_input_tokens_seen": 21588488, + "step": 33085 + }, + { + "epoch": 19.51061320754717, + "grad_norm": 3.431436538696289, + "learning_rate": 1.827176106108719e-08, + "loss": 0.3146, + "num_input_tokens_seen": 21591592, + "step": 33090 + }, + { + "epoch": 19.513561320754718, + "grad_norm": 3.6508395671844482, + "learning_rate": 1.8052677765905137e-08, + "loss": 0.3905, + "num_input_tokens_seen": 21595432, + "step": 33095 + }, + { + "epoch": 19.516509433962263, + "grad_norm": 5.253223896026611, + "learning_rate": 1.783491346274757e-08, + "loss": 0.3075, + "num_input_tokens_seen": 21597864, + "step": 33100 + }, + { + "epoch": 19.51945754716981, + "grad_norm": 4.272646427154541, + "learning_rate": 1.7618468209268936e-08, + "loss": 0.2955, + "num_input_tokens_seen": 21600712, + "step": 33105 + }, + { + "epoch": 19.52240566037736, + "grad_norm": 1.939947247505188, + "learning_rate": 1.7403342062773943e-08, + "loss": 0.2128, + "num_input_tokens_seen": 21606216, + "step": 33110 + }, + { + "epoch": 19.525353773584907, + "grad_norm": 5.6585822105407715, + "learning_rate": 1.718953508021759e-08, + "loss": 0.3772, + "num_input_tokens_seen": 21609768, + "step": 33115 + }, + { + "epoch": 19.528301886792452, + "grad_norm": 2.951385259628296, + "learning_rate": 1.6977047318206262e-08, + "loss": 0.4107, + "num_input_tokens_seen": 21612200, + "step": 33120 + }, + { + "epoch": 19.53125, + "grad_norm": 2.8376553058624268, + "learning_rate": 1.676587883299774e-08, + "loss": 0.3996, + "num_input_tokens_seen": 21615752, + "step": 33125 + }, + { + "epoch": 19.534198113207548, + "grad_norm": 3.900975227355957, + "learning_rate": 1.655602968049952e-08, + "loss": 0.2302, + "num_input_tokens_seen": 21618888, + "step": 33130 + }, + { + "epoch": 19.537146226415093, + "grad_norm": 3.894360065460205, + "learning_rate": 1.634749991626938e-08, + "loss": 0.3375, + "num_input_tokens_seen": 21621352, + "step": 33135 + }, + { + "epoch": 19.54009433962264, + "grad_norm": 3.01920223236084, + "learning_rate": 1.6140289595517056e-08, + "loss": 0.3306, + "num_input_tokens_seen": 21625960, + "step": 33140 + }, + { + "epoch": 19.54304245283019, + "grad_norm": 3.068626880645752, + "learning_rate": 1.5934398773102545e-08, + "loss": 0.3775, + "num_input_tokens_seen": 21629480, + "step": 33145 + }, + { + "epoch": 19.545990566037737, + "grad_norm": 4.055754661560059, + "learning_rate": 1.5729827503536133e-08, + "loss": 0.325, + "num_input_tokens_seen": 21633160, + "step": 33150 + }, + { + "epoch": 19.548938679245282, + "grad_norm": 3.890103340148926, + "learning_rate": 1.5526575840978942e-08, + "loss": 0.2864, + "num_input_tokens_seen": 21635752, + "step": 33155 + }, + { + "epoch": 19.55188679245283, + "grad_norm": 2.6376302242279053, + "learning_rate": 1.532464383924237e-08, + "loss": 0.1994, + "num_input_tokens_seen": 21638792, + "step": 33160 + }, + { + "epoch": 19.55483490566038, + "grad_norm": 3.577603816986084, + "learning_rate": 1.5124031551789208e-08, + "loss": 0.3359, + "num_input_tokens_seen": 21641672, + "step": 33165 + }, + { + "epoch": 19.557783018867923, + "grad_norm": 4.259702205657959, + "learning_rate": 1.4924739031732527e-08, + "loss": 0.3339, + "num_input_tokens_seen": 21644232, + "step": 33170 + }, + { + "epoch": 19.56073113207547, + "grad_norm": 4.122569561004639, + "learning_rate": 1.4726766331835118e-08, + "loss": 0.3654, + "num_input_tokens_seen": 21647624, + "step": 33175 + }, + { + "epoch": 19.56367924528302, + "grad_norm": 3.83862566947937, + "learning_rate": 1.4530113504512278e-08, + "loss": 0.2071, + "num_input_tokens_seen": 21650152, + "step": 33180 + }, + { + "epoch": 19.566627358490567, + "grad_norm": 4.257011890411377, + "learning_rate": 1.4334780601827914e-08, + "loss": 0.3733, + "num_input_tokens_seen": 21653608, + "step": 33185 + }, + { + "epoch": 19.569575471698112, + "grad_norm": 4.108937740325928, + "learning_rate": 1.4140767675497325e-08, + "loss": 0.3154, + "num_input_tokens_seen": 21658120, + "step": 33190 + }, + { + "epoch": 19.57252358490566, + "grad_norm": 1.9480382204055786, + "learning_rate": 1.394807477688609e-08, + "loss": 0.1911, + "num_input_tokens_seen": 21661480, + "step": 33195 + }, + { + "epoch": 19.57547169811321, + "grad_norm": 4.602447509765625, + "learning_rate": 1.3756701957011177e-08, + "loss": 0.2926, + "num_input_tokens_seen": 21664456, + "step": 33200 + }, + { + "epoch": 19.578419811320753, + "grad_norm": 4.1927170753479, + "learning_rate": 1.3566649266538723e-08, + "loss": 0.2349, + "num_input_tokens_seen": 21667592, + "step": 33205 + }, + { + "epoch": 19.5813679245283, + "grad_norm": 2.297913074493408, + "learning_rate": 1.3377916755786257e-08, + "loss": 0.3306, + "num_input_tokens_seen": 21670536, + "step": 33210 + }, + { + "epoch": 19.58431603773585, + "grad_norm": 4.387843132019043, + "learning_rate": 1.3190504474721588e-08, + "loss": 0.2972, + "num_input_tokens_seen": 21673256, + "step": 33215 + }, + { + "epoch": 19.587264150943398, + "grad_norm": 3.084247589111328, + "learning_rate": 1.3004412472962802e-08, + "loss": 0.2841, + "num_input_tokens_seen": 21676424, + "step": 33220 + }, + { + "epoch": 19.590212264150942, + "grad_norm": 6.051823139190674, + "learning_rate": 1.2819640799778266e-08, + "loss": 0.2338, + "num_input_tokens_seen": 21679528, + "step": 33225 + }, + { + "epoch": 19.59316037735849, + "grad_norm": 4.355128288269043, + "learning_rate": 1.2636189504087737e-08, + "loss": 0.3074, + "num_input_tokens_seen": 21682088, + "step": 33230 + }, + { + "epoch": 19.59610849056604, + "grad_norm": 3.2457728385925293, + "learning_rate": 1.2454058634460142e-08, + "loss": 0.281, + "num_input_tokens_seen": 21685192, + "step": 33235 + }, + { + "epoch": 19.599056603773583, + "grad_norm": 5.141165733337402, + "learning_rate": 1.2273248239115798e-08, + "loss": 0.3018, + "num_input_tokens_seen": 21687912, + "step": 33240 + }, + { + "epoch": 19.60200471698113, + "grad_norm": 2.468722105026245, + "learning_rate": 1.2093758365924746e-08, + "loss": 0.2699, + "num_input_tokens_seen": 21691560, + "step": 33245 + }, + { + "epoch": 19.60495283018868, + "grad_norm": 2.7395548820495605, + "learning_rate": 1.1915589062408417e-08, + "loss": 0.3169, + "num_input_tokens_seen": 21694696, + "step": 33250 + }, + { + "epoch": 19.607900943396228, + "grad_norm": 3.670421600341797, + "learning_rate": 1.1738740375736301e-08, + "loss": 0.2779, + "num_input_tokens_seen": 21698344, + "step": 33255 + }, + { + "epoch": 19.610849056603772, + "grad_norm": 6.9125823974609375, + "learning_rate": 1.15632123527315e-08, + "loss": 0.2563, + "num_input_tokens_seen": 21702024, + "step": 33260 + }, + { + "epoch": 19.61379716981132, + "grad_norm": 4.731331825256348, + "learning_rate": 1.1389005039865176e-08, + "loss": 0.2604, + "num_input_tokens_seen": 21705736, + "step": 33265 + }, + { + "epoch": 19.61674528301887, + "grad_norm": 3.6765620708465576, + "learning_rate": 1.1216118483259875e-08, + "loss": 0.3332, + "num_input_tokens_seen": 21708808, + "step": 33270 + }, + { + "epoch": 19.619693396226417, + "grad_norm": 4.913275241851807, + "learning_rate": 1.1044552728687319e-08, + "loss": 0.3127, + "num_input_tokens_seen": 21711112, + "step": 33275 + }, + { + "epoch": 19.62264150943396, + "grad_norm": 3.6895456314086914, + "learning_rate": 1.0874307821570618e-08, + "loss": 0.3657, + "num_input_tokens_seen": 21714600, + "step": 33280 + }, + { + "epoch": 19.62558962264151, + "grad_norm": 4.182900428771973, + "learning_rate": 1.0705383806982606e-08, + "loss": 0.2756, + "num_input_tokens_seen": 21717992, + "step": 33285 + }, + { + "epoch": 19.628537735849058, + "grad_norm": 4.212728977203369, + "learning_rate": 1.0537780729646952e-08, + "loss": 0.1905, + "num_input_tokens_seen": 21720392, + "step": 33290 + }, + { + "epoch": 19.631485849056602, + "grad_norm": 4.146754264831543, + "learning_rate": 1.0371498633937605e-08, + "loss": 0.357, + "num_input_tokens_seen": 21723528, + "step": 33295 + }, + { + "epoch": 19.63443396226415, + "grad_norm": 3.1314327716827393, + "learning_rate": 1.0206537563877683e-08, + "loss": 0.2156, + "num_input_tokens_seen": 21726600, + "step": 33300 + }, + { + "epoch": 19.6373820754717, + "grad_norm": 3.277869462966919, + "learning_rate": 1.0042897563141695e-08, + "loss": 0.2274, + "num_input_tokens_seen": 21729608, + "step": 33305 + }, + { + "epoch": 19.640330188679247, + "grad_norm": 2.9126152992248535, + "learning_rate": 9.88057867505443e-09, + "loss": 0.3439, + "num_input_tokens_seen": 21732904, + "step": 33310 + }, + { + "epoch": 19.64327830188679, + "grad_norm": 3.525317668914795, + "learning_rate": 9.7195809425904e-09, + "loss": 0.4144, + "num_input_tokens_seen": 21735496, + "step": 33315 + }, + { + "epoch": 19.64622641509434, + "grad_norm": 5.068065643310547, + "learning_rate": 9.559904408373844e-09, + "loss": 0.2932, + "num_input_tokens_seen": 21738024, + "step": 33320 + }, + { + "epoch": 19.649174528301888, + "grad_norm": 2.2403738498687744, + "learning_rate": 9.401549114680387e-09, + "loss": 0.2104, + "num_input_tokens_seen": 21742280, + "step": 33325 + }, + { + "epoch": 19.652122641509433, + "grad_norm": 4.431077480316162, + "learning_rate": 9.244515103434826e-09, + "loss": 0.2711, + "num_input_tokens_seen": 21745416, + "step": 33330 + }, + { + "epoch": 19.65507075471698, + "grad_norm": 4.923156261444092, + "learning_rate": 9.08880241621335e-09, + "loss": 0.2808, + "num_input_tokens_seen": 21750344, + "step": 33335 + }, + { + "epoch": 19.65801886792453, + "grad_norm": 3.161891222000122, + "learning_rate": 8.934411094240758e-09, + "loss": 0.3335, + "num_input_tokens_seen": 21754152, + "step": 33340 + }, + { + "epoch": 19.660966981132077, + "grad_norm": 3.94423770904541, + "learning_rate": 8.781341178393244e-09, + "loss": 0.3372, + "num_input_tokens_seen": 21757800, + "step": 33345 + }, + { + "epoch": 19.66391509433962, + "grad_norm": 3.609478235244751, + "learning_rate": 8.629592709196167e-09, + "loss": 0.3472, + "num_input_tokens_seen": 21760808, + "step": 33350 + }, + { + "epoch": 19.66686320754717, + "grad_norm": 3.5832736492156982, + "learning_rate": 8.479165726826277e-09, + "loss": 0.2646, + "num_input_tokens_seen": 21763848, + "step": 33355 + }, + { + "epoch": 19.669811320754718, + "grad_norm": 3.6949477195739746, + "learning_rate": 8.330060271109496e-09, + "loss": 0.3371, + "num_input_tokens_seen": 21766856, + "step": 33360 + }, + { + "epoch": 19.672759433962263, + "grad_norm": 2.791074514389038, + "learning_rate": 8.18227638152258e-09, + "loss": 0.2962, + "num_input_tokens_seen": 21770088, + "step": 33365 + }, + { + "epoch": 19.67570754716981, + "grad_norm": 4.624428749084473, + "learning_rate": 8.035814097191452e-09, + "loss": 0.3431, + "num_input_tokens_seen": 21773128, + "step": 33370 + }, + { + "epoch": 19.67865566037736, + "grad_norm": 5.59794282913208, + "learning_rate": 7.890673456892317e-09, + "loss": 0.294, + "num_input_tokens_seen": 21776296, + "step": 33375 + }, + { + "epoch": 19.681603773584907, + "grad_norm": 4.726974010467529, + "learning_rate": 7.746854499052215e-09, + "loss": 0.2832, + "num_input_tokens_seen": 21779176, + "step": 33380 + }, + { + "epoch": 19.684551886792452, + "grad_norm": 4.822123050689697, + "learning_rate": 7.604357261747907e-09, + "loss": 0.4126, + "num_input_tokens_seen": 21782632, + "step": 33385 + }, + { + "epoch": 19.6875, + "grad_norm": 3.692781686782837, + "learning_rate": 7.463181782705886e-09, + "loss": 0.3576, + "num_input_tokens_seen": 21785512, + "step": 33390 + }, + { + "epoch": 19.690448113207548, + "grad_norm": 2.5595858097076416, + "learning_rate": 7.3233280993034726e-09, + "loss": 0.3427, + "num_input_tokens_seen": 21789128, + "step": 33395 + }, + { + "epoch": 19.693396226415093, + "grad_norm": 3.8874311447143555, + "learning_rate": 7.184796248567161e-09, + "loss": 0.3668, + "num_input_tokens_seen": 21792168, + "step": 33400 + }, + { + "epoch": 19.69634433962264, + "grad_norm": 4.511477470397949, + "learning_rate": 7.047586267173723e-09, + "loss": 0.2823, + "num_input_tokens_seen": 21794888, + "step": 33405 + }, + { + "epoch": 19.69929245283019, + "grad_norm": 3.34980845451355, + "learning_rate": 6.9116981914502114e-09, + "loss": 0.301, + "num_input_tokens_seen": 21798152, + "step": 33410 + }, + { + "epoch": 19.702240566037737, + "grad_norm": 12.337843894958496, + "learning_rate": 6.7771320573734036e-09, + "loss": 0.4092, + "num_input_tokens_seen": 21801000, + "step": 33415 + }, + { + "epoch": 19.705188679245282, + "grad_norm": 8.585694313049316, + "learning_rate": 6.6438879005709114e-09, + "loss": 0.2337, + "num_input_tokens_seen": 21804392, + "step": 33420 + }, + { + "epoch": 19.70813679245283, + "grad_norm": 6.219948768615723, + "learning_rate": 6.511965756318961e-09, + "loss": 0.398, + "num_input_tokens_seen": 21807304, + "step": 33425 + }, + { + "epoch": 19.71108490566038, + "grad_norm": 2.3407206535339355, + "learning_rate": 6.381365659545169e-09, + "loss": 0.2455, + "num_input_tokens_seen": 21812008, + "step": 33430 + }, + { + "epoch": 19.714033018867923, + "grad_norm": 5.603320598602295, + "learning_rate": 6.252087644825766e-09, + "loss": 0.2908, + "num_input_tokens_seen": 21814696, + "step": 33435 + }, + { + "epoch": 19.71698113207547, + "grad_norm": 4.824310302734375, + "learning_rate": 6.124131746388373e-09, + "loss": 0.2589, + "num_input_tokens_seen": 21818344, + "step": 33440 + }, + { + "epoch": 19.71992924528302, + "grad_norm": 3.7015762329101562, + "learning_rate": 5.997497998109225e-09, + "loss": 0.2833, + "num_input_tokens_seen": 21821480, + "step": 33445 + }, + { + "epoch": 19.722877358490567, + "grad_norm": 4.04315710067749, + "learning_rate": 5.8721864335153925e-09, + "loss": 0.2268, + "num_input_tokens_seen": 21827240, + "step": 33450 + }, + { + "epoch": 19.725825471698112, + "grad_norm": 3.7808432579040527, + "learning_rate": 5.748197085784224e-09, + "loss": 0.1984, + "num_input_tokens_seen": 21830440, + "step": 33455 + }, + { + "epoch": 19.72877358490566, + "grad_norm": 2.934755325317383, + "learning_rate": 5.62552998774113e-09, + "loss": 0.3141, + "num_input_tokens_seen": 21833320, + "step": 33460 + }, + { + "epoch": 19.73172169811321, + "grad_norm": 3.226236343383789, + "learning_rate": 5.504185171864018e-09, + "loss": 0.337, + "num_input_tokens_seen": 21837352, + "step": 33465 + }, + { + "epoch": 19.734669811320753, + "grad_norm": 5.003249168395996, + "learning_rate": 5.384162670278858e-09, + "loss": 0.3542, + "num_input_tokens_seen": 21840104, + "step": 33470 + }, + { + "epoch": 19.7376179245283, + "grad_norm": 3.75529146194458, + "learning_rate": 5.265462514762454e-09, + "loss": 0.279, + "num_input_tokens_seen": 21842856, + "step": 33475 + }, + { + "epoch": 19.74056603773585, + "grad_norm": 4.079929828643799, + "learning_rate": 5.148084736740777e-09, + "loss": 0.3237, + "num_input_tokens_seen": 21845992, + "step": 33480 + }, + { + "epoch": 19.743514150943398, + "grad_norm": 2.646021604537964, + "learning_rate": 5.032029367290081e-09, + "loss": 0.311, + "num_input_tokens_seen": 21849576, + "step": 33485 + }, + { + "epoch": 19.746462264150942, + "grad_norm": 2.7977280616760254, + "learning_rate": 4.917296437136898e-09, + "loss": 0.2966, + "num_input_tokens_seen": 21852808, + "step": 33490 + }, + { + "epoch": 19.74941037735849, + "grad_norm": 3.1355819702148438, + "learning_rate": 4.8038859766569305e-09, + "loss": 0.3706, + "num_input_tokens_seen": 21855720, + "step": 33495 + }, + { + "epoch": 19.75235849056604, + "grad_norm": 4.428485870361328, + "learning_rate": 4.691798015876714e-09, + "loss": 0.3162, + "num_input_tokens_seen": 21858248, + "step": 33500 + }, + { + "epoch": 19.755306603773583, + "grad_norm": 2.8598434925079346, + "learning_rate": 4.581032584470846e-09, + "loss": 0.3158, + "num_input_tokens_seen": 21861928, + "step": 33505 + }, + { + "epoch": 19.75825471698113, + "grad_norm": 0.6737237572669983, + "learning_rate": 4.471589711766422e-09, + "loss": 0.2533, + "num_input_tokens_seen": 21868200, + "step": 33510 + }, + { + "epoch": 19.76120283018868, + "grad_norm": 6.102258205413818, + "learning_rate": 4.363469426737487e-09, + "loss": 0.31, + "num_input_tokens_seen": 21871336, + "step": 33515 + }, + { + "epoch": 19.764150943396228, + "grad_norm": 3.0342588424682617, + "learning_rate": 4.256671758010588e-09, + "loss": 0.2765, + "num_input_tokens_seen": 21874088, + "step": 33520 + }, + { + "epoch": 19.767099056603772, + "grad_norm": 3.785327434539795, + "learning_rate": 4.151196733859775e-09, + "loss": 0.2339, + "num_input_tokens_seen": 21876936, + "step": 33525 + }, + { + "epoch": 19.77004716981132, + "grad_norm": 2.918762445449829, + "learning_rate": 4.047044382211041e-09, + "loss": 0.2932, + "num_input_tokens_seen": 21880104, + "step": 33530 + }, + { + "epoch": 19.77299528301887, + "grad_norm": 1.7151894569396973, + "learning_rate": 3.94421473063844e-09, + "loss": 0.3409, + "num_input_tokens_seen": 21883528, + "step": 33535 + }, + { + "epoch": 19.775943396226417, + "grad_norm": 3.7225520610809326, + "learning_rate": 3.842707806366863e-09, + "loss": 0.5135, + "num_input_tokens_seen": 21887752, + "step": 33540 + }, + { + "epoch": 19.77889150943396, + "grad_norm": 10.884625434875488, + "learning_rate": 3.742523636270368e-09, + "loss": 0.4784, + "num_input_tokens_seen": 21891240, + "step": 33545 + }, + { + "epoch": 19.78183962264151, + "grad_norm": 6.8832783699035645, + "learning_rate": 3.6436622468738468e-09, + "loss": 0.3121, + "num_input_tokens_seen": 21893864, + "step": 33550 + }, + { + "epoch": 19.784787735849058, + "grad_norm": 3.7381439208984375, + "learning_rate": 3.546123664350254e-09, + "loss": 0.3719, + "num_input_tokens_seen": 21896904, + "step": 33555 + }, + { + "epoch": 19.787735849056602, + "grad_norm": 3.070403575897217, + "learning_rate": 3.449907914524486e-09, + "loss": 0.347, + "num_input_tokens_seen": 21899400, + "step": 33560 + }, + { + "epoch": 19.79068396226415, + "grad_norm": 3.017155408859253, + "learning_rate": 3.355015022869501e-09, + "loss": 0.2557, + "num_input_tokens_seen": 21902408, + "step": 33565 + }, + { + "epoch": 19.7936320754717, + "grad_norm": 3.618273973464966, + "learning_rate": 3.261445014508535e-09, + "loss": 0.3006, + "num_input_tokens_seen": 21905992, + "step": 33570 + }, + { + "epoch": 19.796580188679247, + "grad_norm": 3.5126900672912598, + "learning_rate": 3.1691979142145503e-09, + "loss": 0.3445, + "num_input_tokens_seen": 21910920, + "step": 33575 + }, + { + "epoch": 19.79952830188679, + "grad_norm": 2.4283151626586914, + "learning_rate": 3.078273746410787e-09, + "loss": 0.2323, + "num_input_tokens_seen": 21914984, + "step": 33580 + }, + { + "epoch": 19.80247641509434, + "grad_norm": 3.705442190170288, + "learning_rate": 2.988672535169657e-09, + "loss": 0.3341, + "num_input_tokens_seen": 21917800, + "step": 33585 + }, + { + "epoch": 19.805424528301888, + "grad_norm": 1.9761531352996826, + "learning_rate": 2.9003943042127393e-09, + "loss": 0.2924, + "num_input_tokens_seen": 21920680, + "step": 33590 + }, + { + "epoch": 19.808372641509433, + "grad_norm": 3.415003776550293, + "learning_rate": 2.8134390769135598e-09, + "loss": 0.3244, + "num_input_tokens_seen": 21923880, + "step": 33595 + }, + { + "epoch": 19.81132075471698, + "grad_norm": 5.928948879241943, + "learning_rate": 2.7278068762925935e-09, + "loss": 0.2789, + "num_input_tokens_seen": 21926632, + "step": 33600 + }, + { + "epoch": 19.81426886792453, + "grad_norm": 3.166869640350342, + "learning_rate": 2.6434977250217043e-09, + "loss": 0.3285, + "num_input_tokens_seen": 21929544, + "step": 33605 + }, + { + "epoch": 19.817216981132077, + "grad_norm": 4.835379600524902, + "learning_rate": 2.560511645422481e-09, + "loss": 0.2893, + "num_input_tokens_seen": 21932424, + "step": 33610 + }, + { + "epoch": 19.82016509433962, + "grad_norm": 3.599423408508301, + "learning_rate": 2.4788486594656825e-09, + "loss": 0.3389, + "num_input_tokens_seen": 21935848, + "step": 33615 + }, + { + "epoch": 19.82311320754717, + "grad_norm": 2.914738893508911, + "learning_rate": 2.398508788771792e-09, + "loss": 0.4151, + "num_input_tokens_seen": 21939240, + "step": 33620 + }, + { + "epoch": 19.826061320754718, + "grad_norm": 3.976651668548584, + "learning_rate": 2.3194920546110166e-09, + "loss": 0.2993, + "num_input_tokens_seen": 21942312, + "step": 33625 + }, + { + "epoch": 19.829009433962263, + "grad_norm": 7.003410816192627, + "learning_rate": 2.2417984779032896e-09, + "loss": 0.3233, + "num_input_tokens_seen": 21945800, + "step": 33630 + }, + { + "epoch": 19.83195754716981, + "grad_norm": 2.878335952758789, + "learning_rate": 2.1654280792193782e-09, + "loss": 0.3665, + "num_input_tokens_seen": 21949544, + "step": 33635 + }, + { + "epoch": 19.83490566037736, + "grad_norm": 4.359111309051514, + "learning_rate": 2.0903808787769987e-09, + "loss": 0.3524, + "num_input_tokens_seen": 21952808, + "step": 33640 + }, + { + "epoch": 19.837853773584907, + "grad_norm": 3.0404725074768066, + "learning_rate": 2.0166568964463673e-09, + "loss": 0.2926, + "num_input_tokens_seen": 21960040, + "step": 33645 + }, + { + "epoch": 19.840801886792452, + "grad_norm": 3.753020763397217, + "learning_rate": 1.9442561517463153e-09, + "loss": 0.2284, + "num_input_tokens_seen": 21963784, + "step": 33650 + }, + { + "epoch": 19.84375, + "grad_norm": 3.1365296840667725, + "learning_rate": 1.8731786638442886e-09, + "loss": 0.335, + "num_input_tokens_seen": 21967144, + "step": 33655 + }, + { + "epoch": 19.846698113207548, + "grad_norm": 3.0995407104492188, + "learning_rate": 1.8034244515591214e-09, + "loss": 0.3539, + "num_input_tokens_seen": 21970248, + "step": 33660 + }, + { + "epoch": 19.849646226415093, + "grad_norm": 4.749851226806641, + "learning_rate": 1.7349935333582646e-09, + "loss": 0.2825, + "num_input_tokens_seen": 21973448, + "step": 33665 + }, + { + "epoch": 19.85259433962264, + "grad_norm": 5.330183982849121, + "learning_rate": 1.6678859273594471e-09, + "loss": 0.2435, + "num_input_tokens_seen": 21976360, + "step": 33670 + }, + { + "epoch": 19.85554245283019, + "grad_norm": 4.068929195404053, + "learning_rate": 1.6021016513295683e-09, + "loss": 0.301, + "num_input_tokens_seen": 21979208, + "step": 33675 + }, + { + "epoch": 19.858490566037737, + "grad_norm": 3.971526861190796, + "learning_rate": 1.5376407226846968e-09, + "loss": 0.3385, + "num_input_tokens_seen": 21981672, + "step": 33680 + }, + { + "epoch": 19.861438679245282, + "grad_norm": 3.8497467041015625, + "learning_rate": 1.4745031584917357e-09, + "loss": 0.2773, + "num_input_tokens_seen": 21983944, + "step": 33685 + }, + { + "epoch": 19.86438679245283, + "grad_norm": 4.032742500305176, + "learning_rate": 1.4126889754667583e-09, + "loss": 0.2815, + "num_input_tokens_seen": 21987208, + "step": 33690 + }, + { + "epoch": 19.86733490566038, + "grad_norm": 4.820897579193115, + "learning_rate": 1.3521981899750069e-09, + "loss": 0.3957, + "num_input_tokens_seen": 21991912, + "step": 33695 + }, + { + "epoch": 19.870283018867923, + "grad_norm": 2.5630223751068115, + "learning_rate": 1.293030818032004e-09, + "loss": 0.3726, + "num_input_tokens_seen": 21995976, + "step": 33700 + }, + { + "epoch": 19.87323113207547, + "grad_norm": 3.7156996726989746, + "learning_rate": 1.2351868753018858e-09, + "loss": 0.3954, + "num_input_tokens_seen": 21999144, + "step": 33705 + }, + { + "epoch": 19.87617924528302, + "grad_norm": 2.6448280811309814, + "learning_rate": 1.1786663770996242e-09, + "loss": 0.189, + "num_input_tokens_seen": 22002216, + "step": 33710 + }, + { + "epoch": 19.879127358490567, + "grad_norm": 2.757309675216675, + "learning_rate": 1.1234693383893602e-09, + "loss": 0.3457, + "num_input_tokens_seen": 22007016, + "step": 33715 + }, + { + "epoch": 19.882075471698112, + "grad_norm": 4.485562801361084, + "learning_rate": 1.0695957737844043e-09, + "loss": 0.3566, + "num_input_tokens_seen": 22010440, + "step": 33720 + }, + { + "epoch": 19.88502358490566, + "grad_norm": 3.5489165782928467, + "learning_rate": 1.0170456975483467e-09, + "loss": 0.2645, + "num_input_tokens_seen": 22014088, + "step": 33725 + }, + { + "epoch": 19.88797169811321, + "grad_norm": 3.215893030166626, + "learning_rate": 9.658191235933922e-10, + "loss": 0.3112, + "num_input_tokens_seen": 22017352, + "step": 33730 + }, + { + "epoch": 19.890919811320753, + "grad_norm": 2.998401641845703, + "learning_rate": 9.159160654825805e-10, + "loss": 0.295, + "num_input_tokens_seen": 22021192, + "step": 33735 + }, + { + "epoch": 19.8938679245283, + "grad_norm": 7.281124114990234, + "learning_rate": 8.673365364281205e-10, + "loss": 0.2125, + "num_input_tokens_seen": 22023784, + "step": 33740 + }, + { + "epoch": 19.89681603773585, + "grad_norm": 2.0302274227142334, + "learning_rate": 8.200805492913911e-10, + "loss": 0.288, + "num_input_tokens_seen": 22027176, + "step": 33745 + }, + { + "epoch": 19.899764150943398, + "grad_norm": 5.196775436401367, + "learning_rate": 7.741481165834952e-10, + "loss": 0.3316, + "num_input_tokens_seen": 22030248, + "step": 33750 + }, + { + "epoch": 19.902712264150942, + "grad_norm": 5.006543159484863, + "learning_rate": 7.29539250465261e-10, + "loss": 0.3139, + "num_input_tokens_seen": 22033384, + "step": 33755 + }, + { + "epoch": 19.90566037735849, + "grad_norm": 3.58197021484375, + "learning_rate": 6.862539627472409e-10, + "loss": 0.3287, + "num_input_tokens_seen": 22037128, + "step": 33760 + }, + { + "epoch": 19.90860849056604, + "grad_norm": 5.2429962158203125, + "learning_rate": 6.442922648897121e-10, + "loss": 0.35, + "num_input_tokens_seen": 22040264, + "step": 33765 + }, + { + "epoch": 19.911556603773583, + "grad_norm": 4.104638576507568, + "learning_rate": 6.036541680015662e-10, + "loss": 0.3984, + "num_input_tokens_seen": 22042472, + "step": 33770 + }, + { + "epoch": 19.91450471698113, + "grad_norm": 2.1150755882263184, + "learning_rate": 5.643396828419745e-10, + "loss": 0.3069, + "num_input_tokens_seen": 22046088, + "step": 33775 + }, + { + "epoch": 19.91745283018868, + "grad_norm": 3.7125000953674316, + "learning_rate": 5.26348819819833e-10, + "loss": 0.3025, + "num_input_tokens_seen": 22049192, + "step": 33780 + }, + { + "epoch": 19.920400943396228, + "grad_norm": 3.397738218307495, + "learning_rate": 4.896815889937622e-10, + "loss": 0.2736, + "num_input_tokens_seen": 22052776, + "step": 33785 + }, + { + "epoch": 19.923349056603772, + "grad_norm": 5.0815277099609375, + "learning_rate": 4.543380000704423e-10, + "loss": 0.3318, + "num_input_tokens_seen": 22055752, + "step": 33790 + }, + { + "epoch": 19.92629716981132, + "grad_norm": 5.774856090545654, + "learning_rate": 4.203180624084979e-10, + "loss": 0.2691, + "num_input_tokens_seen": 22059048, + "step": 33795 + }, + { + "epoch": 19.92924528301887, + "grad_norm": 4.3554606437683105, + "learning_rate": 3.876217850146136e-10, + "loss": 0.3122, + "num_input_tokens_seen": 22063176, + "step": 33800 + }, + { + "epoch": 19.932193396226417, + "grad_norm": 2.9344284534454346, + "learning_rate": 3.56249176544643e-10, + "loss": 0.3053, + "num_input_tokens_seen": 22066952, + "step": 33805 + }, + { + "epoch": 19.93514150943396, + "grad_norm": 2.6180758476257324, + "learning_rate": 3.262002453047197e-10, + "loss": 0.3078, + "num_input_tokens_seen": 22070312, + "step": 33810 + }, + { + "epoch": 19.93808962264151, + "grad_norm": 4.800815582275391, + "learning_rate": 2.974749992512571e-10, + "loss": 0.4239, + "num_input_tokens_seen": 22073608, + "step": 33815 + }, + { + "epoch": 19.941037735849058, + "grad_norm": 5.486624717712402, + "learning_rate": 2.700734459881726e-10, + "loss": 0.2968, + "num_input_tokens_seen": 22076264, + "step": 33820 + }, + { + "epoch": 19.943985849056602, + "grad_norm": 4.669373512268066, + "learning_rate": 2.439955927713289e-10, + "loss": 0.3224, + "num_input_tokens_seen": 22079624, + "step": 33825 + }, + { + "epoch": 19.94693396226415, + "grad_norm": 3.429659366607666, + "learning_rate": 2.1924144650409263e-10, + "loss": 0.3369, + "num_input_tokens_seen": 22084136, + "step": 33830 + }, + { + "epoch": 19.9498820754717, + "grad_norm": 4.6284871101379395, + "learning_rate": 1.9581101374066546e-10, + "loss": 0.3427, + "num_input_tokens_seen": 22086984, + "step": 33835 + }, + { + "epoch": 19.952830188679247, + "grad_norm": 3.6983838081359863, + "learning_rate": 1.7370430068441858e-10, + "loss": 0.3402, + "num_input_tokens_seen": 22090760, + "step": 33840 + }, + { + "epoch": 19.95577830188679, + "grad_norm": 2.2568986415863037, + "learning_rate": 1.529213131878926e-10, + "loss": 0.2809, + "num_input_tokens_seen": 22093544, + "step": 33845 + }, + { + "epoch": 19.95872641509434, + "grad_norm": 3.906728506088257, + "learning_rate": 1.3346205675335288e-10, + "loss": 0.2686, + "num_input_tokens_seen": 22097032, + "step": 33850 + }, + { + "epoch": 19.961674528301888, + "grad_norm": 3.609347343444824, + "learning_rate": 1.1532653653334447e-10, + "loss": 0.2918, + "num_input_tokens_seen": 22100904, + "step": 33855 + }, + { + "epoch": 19.964622641509433, + "grad_norm": 3.7154812812805176, + "learning_rate": 9.85147573284717e-11, + "loss": 0.2894, + "num_input_tokens_seen": 22104136, + "step": 33860 + }, + { + "epoch": 19.96757075471698, + "grad_norm": 3.174069404602051, + "learning_rate": 8.302672359072894e-11, + "loss": 0.3854, + "num_input_tokens_seen": 22107496, + "step": 33865 + }, + { + "epoch": 19.97051886792453, + "grad_norm": 4.42632532119751, + "learning_rate": 6.886243941961468e-11, + "loss": 0.339, + "num_input_tokens_seen": 22109960, + "step": 33870 + }, + { + "epoch": 19.973466981132077, + "grad_norm": 4.359560966491699, + "learning_rate": 5.602190856601741e-11, + "loss": 0.2939, + "num_input_tokens_seen": 22113384, + "step": 33875 + }, + { + "epoch": 19.97641509433962, + "grad_norm": 3.5203444957733154, + "learning_rate": 4.450513442888493e-11, + "loss": 0.2258, + "num_input_tokens_seen": 22116328, + "step": 33880 + }, + { + "epoch": 19.97936320754717, + "grad_norm": 5.701579570770264, + "learning_rate": 3.4312120057999886e-11, + "loss": 0.249, + "num_input_tokens_seen": 22119368, + "step": 33885 + }, + { + "epoch": 19.982311320754718, + "grad_norm": 4.176498889923096, + "learning_rate": 2.5442868151204224e-11, + "loss": 0.2913, + "num_input_tokens_seen": 22121960, + "step": 33890 + }, + { + "epoch": 19.985259433962263, + "grad_norm": 3.7649621963500977, + "learning_rate": 1.7897381057729867e-11, + "loss": 0.4277, + "num_input_tokens_seen": 22124904, + "step": 33895 + }, + { + "epoch": 19.98820754716981, + "grad_norm": 4.336522579193115, + "learning_rate": 1.1675660773757813e-11, + "loss": 0.2248, + "num_input_tokens_seen": 22127944, + "step": 33900 + }, + { + "epoch": 19.99115566037736, + "grad_norm": 2.063112258911133, + "learning_rate": 6.777708947969253e-12, + "loss": 0.2733, + "num_input_tokens_seen": 22130952, + "step": 33905 + }, + { + "epoch": 19.994103773584907, + "grad_norm": 5.496767520904541, + "learning_rate": 3.2035268765495674e-12, + "loss": 0.2144, + "num_input_tokens_seen": 22133416, + "step": 33910 + }, + { + "epoch": 19.997051886792452, + "grad_norm": 3.0440354347229004, + "learning_rate": 9.531155059638863e-13, + "loss": 0.3743, + "num_input_tokens_seen": 22136744, + "step": 33915 + }, + { + "epoch": 20.0, + "grad_norm": 4.857840538024902, + "learning_rate": 2.6475431291750342e-14, + "loss": 0.2493, + "num_input_tokens_seen": 22139032, + "step": 33920 + }, + { + "epoch": 20.0, + "eval_loss": 0.6312955617904663, + "eval_runtime": 19.3725, + "eval_samples_per_second": 87.547, + "eval_steps_per_second": 21.887, + "num_input_tokens_seen": 22139032, + "step": 33920 + }, + { + "epoch": 20.0, + "num_input_tokens_seen": 22139032, + "step": 33920, + "total_flos": 9.96910604825985e+17, + "train_loss": 0.4624784426517644, + "train_runtime": 3944.015, + "train_samples_per_second": 34.386, + "train_steps_per_second": 8.6 + } + ], + "logging_steps": 5, + "max_steps": 33920, + "num_input_tokens_seen": 22139032, + "num_train_epochs": 20, + "save_steps": 3392, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 9.96910604825985e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}