{ "best_global_step": 34632, "best_metric": 0.14623308181762695, "best_model_checkpoint": "saves_multiple/ia3/llama-3-8b-instruct/train_cola_456_1760637821/checkpoint-34632", "epoch": 20.0, "eval_steps": 1924, "global_step": 38480, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002598752598752599, "grad_norm": 5.390900611877441, "learning_rate": 5.197505197505198e-08, "loss": 2.1137, "num_input_tokens_seen": 960, "step": 5 }, { "epoch": 0.005197505197505198, "grad_norm": 4.108017921447754, "learning_rate": 1.1694386694386695e-07, "loss": 1.0815, "num_input_tokens_seen": 1952, "step": 10 }, { "epoch": 0.007796257796257797, "grad_norm": 5.489694595336914, "learning_rate": 1.8191268191268192e-07, "loss": 1.681, "num_input_tokens_seen": 2976, "step": 15 }, { "epoch": 0.010395010395010396, "grad_norm": 5.464057922363281, "learning_rate": 2.468814968814969e-07, "loss": 1.9192, "num_input_tokens_seen": 3968, "step": 20 }, { "epoch": 0.012993762993762994, "grad_norm": 4.468008518218994, "learning_rate": 3.1185031185031186e-07, "loss": 1.7691, "num_input_tokens_seen": 4864, "step": 25 }, { "epoch": 0.015592515592515593, "grad_norm": 5.350846767425537, "learning_rate": 3.768191268191269e-07, "loss": 1.1956, "num_input_tokens_seen": 5792, "step": 30 }, { "epoch": 0.018191268191268192, "grad_norm": 5.2948737144470215, "learning_rate": 4.4178794178794185e-07, "loss": 1.5342, "num_input_tokens_seen": 6784, "step": 35 }, { "epoch": 0.02079002079002079, "grad_norm": 7.1127448081970215, "learning_rate": 5.067567567567568e-07, "loss": 1.3079, "num_input_tokens_seen": 7776, "step": 40 }, { "epoch": 0.02338877338877339, "grad_norm": 4.516308307647705, "learning_rate": 5.717255717255718e-07, "loss": 1.1775, "num_input_tokens_seen": 8736, "step": 45 }, { "epoch": 0.02598752598752599, "grad_norm": 5.124841690063477, "learning_rate": 6.366943866943868e-07, "loss": 1.6009, "num_input_tokens_seen": 9664, "step": 50 }, { "epoch": 0.028586278586278588, "grad_norm": 5.078517913818359, "learning_rate": 7.016632016632017e-07, "loss": 1.3078, "num_input_tokens_seen": 10656, "step": 55 }, { "epoch": 0.031185031185031187, "grad_norm": 4.927834987640381, "learning_rate": 7.666320166320167e-07, "loss": 1.3145, "num_input_tokens_seen": 11584, "step": 60 }, { "epoch": 0.033783783783783786, "grad_norm": 4.891884803771973, "learning_rate": 8.316008316008317e-07, "loss": 1.5184, "num_input_tokens_seen": 12480, "step": 65 }, { "epoch": 0.036382536382536385, "grad_norm": 4.2314229011535645, "learning_rate": 8.965696465696466e-07, "loss": 1.0893, "num_input_tokens_seen": 13376, "step": 70 }, { "epoch": 0.03898128898128898, "grad_norm": 5.31501579284668, "learning_rate": 9.615384615384617e-07, "loss": 1.3677, "num_input_tokens_seen": 14272, "step": 75 }, { "epoch": 0.04158004158004158, "grad_norm": 4.053516387939453, "learning_rate": 1.0265072765072766e-06, "loss": 1.3394, "num_input_tokens_seen": 15200, "step": 80 }, { "epoch": 0.04417879417879418, "grad_norm": 5.407421588897705, "learning_rate": 1.0914760914760916e-06, "loss": 1.7587, "num_input_tokens_seen": 16128, "step": 85 }, { "epoch": 0.04677754677754678, "grad_norm": 4.6009297370910645, "learning_rate": 1.1564449064449065e-06, "loss": 1.3611, "num_input_tokens_seen": 17088, "step": 90 }, { "epoch": 0.04937629937629938, "grad_norm": 4.211420059204102, "learning_rate": 1.2214137214137216e-06, "loss": 1.5119, "num_input_tokens_seen": 18016, "step": 95 }, { "epoch": 0.05197505197505198, "grad_norm": 5.090166091918945, "learning_rate": 1.2863825363825364e-06, "loss": 1.3283, "num_input_tokens_seen": 18976, "step": 100 }, { "epoch": 0.05457380457380458, "grad_norm": 5.226001739501953, "learning_rate": 1.3513513513513515e-06, "loss": 1.2739, "num_input_tokens_seen": 19936, "step": 105 }, { "epoch": 0.057172557172557176, "grad_norm": 4.967441558837891, "learning_rate": 1.4163201663201664e-06, "loss": 1.4597, "num_input_tokens_seen": 20960, "step": 110 }, { "epoch": 0.059771309771309775, "grad_norm": 4.622501850128174, "learning_rate": 1.4812889812889814e-06, "loss": 1.6943, "num_input_tokens_seen": 21952, "step": 115 }, { "epoch": 0.062370062370062374, "grad_norm": 5.382086277008057, "learning_rate": 1.5462577962577965e-06, "loss": 1.5935, "num_input_tokens_seen": 22976, "step": 120 }, { "epoch": 0.06496881496881497, "grad_norm": 4.761887073516846, "learning_rate": 1.6112266112266114e-06, "loss": 1.4063, "num_input_tokens_seen": 23936, "step": 125 }, { "epoch": 0.06756756756756757, "grad_norm": 5.615755558013916, "learning_rate": 1.6761954261954265e-06, "loss": 1.477, "num_input_tokens_seen": 24896, "step": 130 }, { "epoch": 0.07016632016632017, "grad_norm": 4.3888044357299805, "learning_rate": 1.7411642411642413e-06, "loss": 1.3631, "num_input_tokens_seen": 25824, "step": 135 }, { "epoch": 0.07276507276507277, "grad_norm": 4.965954780578613, "learning_rate": 1.8061330561330564e-06, "loss": 1.3002, "num_input_tokens_seen": 26720, "step": 140 }, { "epoch": 0.07536382536382537, "grad_norm": 5.265811920166016, "learning_rate": 1.8711018711018713e-06, "loss": 1.3947, "num_input_tokens_seen": 27584, "step": 145 }, { "epoch": 0.07796257796257797, "grad_norm": 4.747727870941162, "learning_rate": 1.9360706860706863e-06, "loss": 1.4056, "num_input_tokens_seen": 28576, "step": 150 }, { "epoch": 0.08056133056133057, "grad_norm": 5.738931655883789, "learning_rate": 2.0010395010395014e-06, "loss": 1.3329, "num_input_tokens_seen": 29536, "step": 155 }, { "epoch": 0.08316008316008316, "grad_norm": 4.395679473876953, "learning_rate": 2.066008316008316e-06, "loss": 1.1216, "num_input_tokens_seen": 30432, "step": 160 }, { "epoch": 0.08575883575883576, "grad_norm": 5.015817165374756, "learning_rate": 2.130977130977131e-06, "loss": 1.189, "num_input_tokens_seen": 31360, "step": 165 }, { "epoch": 0.08835758835758836, "grad_norm": 5.161103248596191, "learning_rate": 2.195945945945946e-06, "loss": 1.3894, "num_input_tokens_seen": 32256, "step": 170 }, { "epoch": 0.09095634095634096, "grad_norm": 4.077630519866943, "learning_rate": 2.2609147609147613e-06, "loss": 1.5356, "num_input_tokens_seen": 33120, "step": 175 }, { "epoch": 0.09355509355509356, "grad_norm": 5.670849800109863, "learning_rate": 2.325883575883576e-06, "loss": 1.7132, "num_input_tokens_seen": 34048, "step": 180 }, { "epoch": 0.09615384615384616, "grad_norm": 4.85869836807251, "learning_rate": 2.390852390852391e-06, "loss": 1.3681, "num_input_tokens_seen": 34976, "step": 185 }, { "epoch": 0.09875259875259876, "grad_norm": 4.798767566680908, "learning_rate": 2.455821205821206e-06, "loss": 1.3093, "num_input_tokens_seen": 35968, "step": 190 }, { "epoch": 0.10135135135135136, "grad_norm": 4.2810540199279785, "learning_rate": 2.520790020790021e-06, "loss": 1.1733, "num_input_tokens_seen": 36960, "step": 195 }, { "epoch": 0.10395010395010396, "grad_norm": 4.245293140411377, "learning_rate": 2.5857588357588362e-06, "loss": 1.5229, "num_input_tokens_seen": 37888, "step": 200 }, { "epoch": 0.10654885654885655, "grad_norm": 4.542383670806885, "learning_rate": 2.650727650727651e-06, "loss": 1.1705, "num_input_tokens_seen": 38880, "step": 205 }, { "epoch": 0.10914760914760915, "grad_norm": 5.018443584442139, "learning_rate": 2.715696465696466e-06, "loss": 1.7054, "num_input_tokens_seen": 39840, "step": 210 }, { "epoch": 0.11174636174636175, "grad_norm": 4.876917362213135, "learning_rate": 2.780665280665281e-06, "loss": 1.3018, "num_input_tokens_seen": 40736, "step": 215 }, { "epoch": 0.11434511434511435, "grad_norm": 4.903168678283691, "learning_rate": 2.845634095634096e-06, "loss": 1.2685, "num_input_tokens_seen": 41728, "step": 220 }, { "epoch": 0.11694386694386695, "grad_norm": 5.529719829559326, "learning_rate": 2.9106029106029107e-06, "loss": 2.0038, "num_input_tokens_seen": 42656, "step": 225 }, { "epoch": 0.11954261954261955, "grad_norm": 4.366907119750977, "learning_rate": 2.975571725571726e-06, "loss": 1.4704, "num_input_tokens_seen": 43616, "step": 230 }, { "epoch": 0.12214137214137215, "grad_norm": 4.276919364929199, "learning_rate": 3.040540540540541e-06, "loss": 1.1395, "num_input_tokens_seen": 44544, "step": 235 }, { "epoch": 0.12474012474012475, "grad_norm": 4.985826015472412, "learning_rate": 3.105509355509356e-06, "loss": 1.3822, "num_input_tokens_seen": 45440, "step": 240 }, { "epoch": 0.12733887733887733, "grad_norm": 3.476609706878662, "learning_rate": 3.1704781704781706e-06, "loss": 1.3046, "num_input_tokens_seen": 46400, "step": 245 }, { "epoch": 0.12993762993762994, "grad_norm": 4.192017078399658, "learning_rate": 3.2354469854469853e-06, "loss": 1.3256, "num_input_tokens_seen": 47360, "step": 250 }, { "epoch": 0.13253638253638253, "grad_norm": 3.905871629714966, "learning_rate": 3.3004158004158008e-06, "loss": 1.2828, "num_input_tokens_seen": 48352, "step": 255 }, { "epoch": 0.13513513513513514, "grad_norm": 4.52439022064209, "learning_rate": 3.3653846153846154e-06, "loss": 1.5133, "num_input_tokens_seen": 49280, "step": 260 }, { "epoch": 0.13773388773388773, "grad_norm": 4.239352703094482, "learning_rate": 3.430353430353431e-06, "loss": 1.3136, "num_input_tokens_seen": 50208, "step": 265 }, { "epoch": 0.14033264033264034, "grad_norm": 4.907905578613281, "learning_rate": 3.495322245322245e-06, "loss": 1.6679, "num_input_tokens_seen": 51136, "step": 270 }, { "epoch": 0.14293139293139293, "grad_norm": 4.544057369232178, "learning_rate": 3.5602910602910606e-06, "loss": 1.3045, "num_input_tokens_seen": 52064, "step": 275 }, { "epoch": 0.14553014553014554, "grad_norm": 4.806271553039551, "learning_rate": 3.6252598752598753e-06, "loss": 1.4569, "num_input_tokens_seen": 53024, "step": 280 }, { "epoch": 0.14812889812889812, "grad_norm": 4.126349925994873, "learning_rate": 3.6902286902286908e-06, "loss": 1.1014, "num_input_tokens_seen": 53984, "step": 285 }, { "epoch": 0.15072765072765074, "grad_norm": 5.079155445098877, "learning_rate": 3.7551975051975054e-06, "loss": 1.3949, "num_input_tokens_seen": 54976, "step": 290 }, { "epoch": 0.15332640332640332, "grad_norm": 4.6259684562683105, "learning_rate": 3.820166320166321e-06, "loss": 1.2977, "num_input_tokens_seen": 55904, "step": 295 }, { "epoch": 0.15592515592515593, "grad_norm": 4.361374855041504, "learning_rate": 3.885135135135135e-06, "loss": 1.7146, "num_input_tokens_seen": 56864, "step": 300 }, { "epoch": 0.15852390852390852, "grad_norm": 4.97282600402832, "learning_rate": 3.95010395010395e-06, "loss": 1.4164, "num_input_tokens_seen": 57760, "step": 305 }, { "epoch": 0.16112266112266113, "grad_norm": 4.699381351470947, "learning_rate": 4.015072765072765e-06, "loss": 1.3343, "num_input_tokens_seen": 58720, "step": 310 }, { "epoch": 0.16372141372141372, "grad_norm": 5.395305633544922, "learning_rate": 4.08004158004158e-06, "loss": 1.6583, "num_input_tokens_seen": 59680, "step": 315 }, { "epoch": 0.16632016632016633, "grad_norm": 4.437445640563965, "learning_rate": 4.1450103950103955e-06, "loss": 1.1167, "num_input_tokens_seen": 60672, "step": 320 }, { "epoch": 0.16891891891891891, "grad_norm": 4.626900672912598, "learning_rate": 4.2099792099792105e-06, "loss": 1.32, "num_input_tokens_seen": 61600, "step": 325 }, { "epoch": 0.17151767151767153, "grad_norm": 5.3768415451049805, "learning_rate": 4.274948024948025e-06, "loss": 1.4241, "num_input_tokens_seen": 62528, "step": 330 }, { "epoch": 0.1741164241164241, "grad_norm": 5.598320484161377, "learning_rate": 4.339916839916841e-06, "loss": 1.2792, "num_input_tokens_seen": 63424, "step": 335 }, { "epoch": 0.17671517671517672, "grad_norm": 4.19356632232666, "learning_rate": 4.404885654885655e-06, "loss": 1.5734, "num_input_tokens_seen": 64384, "step": 340 }, { "epoch": 0.1793139293139293, "grad_norm": 4.617959976196289, "learning_rate": 4.46985446985447e-06, "loss": 1.3601, "num_input_tokens_seen": 65376, "step": 345 }, { "epoch": 0.18191268191268192, "grad_norm": 4.874173641204834, "learning_rate": 4.534823284823285e-06, "loss": 1.2367, "num_input_tokens_seen": 66304, "step": 350 }, { "epoch": 0.1845114345114345, "grad_norm": 4.283167839050293, "learning_rate": 4.5997920997921e-06, "loss": 1.4771, "num_input_tokens_seen": 67232, "step": 355 }, { "epoch": 0.18711018711018712, "grad_norm": 5.339477062225342, "learning_rate": 4.664760914760915e-06, "loss": 1.02, "num_input_tokens_seen": 68160, "step": 360 }, { "epoch": 0.1897089397089397, "grad_norm": 4.631237983703613, "learning_rate": 4.72972972972973e-06, "loss": 1.6013, "num_input_tokens_seen": 69056, "step": 365 }, { "epoch": 0.19230769230769232, "grad_norm": 4.893244743347168, "learning_rate": 4.7946985446985445e-06, "loss": 1.8517, "num_input_tokens_seen": 70080, "step": 370 }, { "epoch": 0.1949064449064449, "grad_norm": 4.31270170211792, "learning_rate": 4.8596673596673604e-06, "loss": 1.1191, "num_input_tokens_seen": 71040, "step": 375 }, { "epoch": 0.19750519750519752, "grad_norm": 4.578235626220703, "learning_rate": 4.924636174636175e-06, "loss": 1.3228, "num_input_tokens_seen": 71968, "step": 380 }, { "epoch": 0.2001039501039501, "grad_norm": 5.901975631713867, "learning_rate": 4.98960498960499e-06, "loss": 1.6021, "num_input_tokens_seen": 72896, "step": 385 }, { "epoch": 0.20270270270270271, "grad_norm": 4.650174617767334, "learning_rate": 5.054573804573805e-06, "loss": 1.13, "num_input_tokens_seen": 73856, "step": 390 }, { "epoch": 0.2053014553014553, "grad_norm": 4.2407026290893555, "learning_rate": 5.11954261954262e-06, "loss": 1.0121, "num_input_tokens_seen": 74816, "step": 395 }, { "epoch": 0.2079002079002079, "grad_norm": 5.0105695724487305, "learning_rate": 5.184511434511435e-06, "loss": 1.3348, "num_input_tokens_seen": 75776, "step": 400 }, { "epoch": 0.2104989604989605, "grad_norm": 5.161824703216553, "learning_rate": 5.24948024948025e-06, "loss": 1.3664, "num_input_tokens_seen": 76704, "step": 405 }, { "epoch": 0.2130977130977131, "grad_norm": 4.53255033493042, "learning_rate": 5.314449064449064e-06, "loss": 1.3917, "num_input_tokens_seen": 77568, "step": 410 }, { "epoch": 0.2156964656964657, "grad_norm": 4.456571102142334, "learning_rate": 5.37941787941788e-06, "loss": 1.2604, "num_input_tokens_seen": 78496, "step": 415 }, { "epoch": 0.2182952182952183, "grad_norm": 4.443542003631592, "learning_rate": 5.444386694386694e-06, "loss": 1.364, "num_input_tokens_seen": 79456, "step": 420 }, { "epoch": 0.2208939708939709, "grad_norm": 4.97418737411499, "learning_rate": 5.50935550935551e-06, "loss": 1.1632, "num_input_tokens_seen": 80384, "step": 425 }, { "epoch": 0.2234927234927235, "grad_norm": 4.946002006530762, "learning_rate": 5.5743243243243245e-06, "loss": 1.7891, "num_input_tokens_seen": 81280, "step": 430 }, { "epoch": 0.2260914760914761, "grad_norm": 5.147231578826904, "learning_rate": 5.63929313929314e-06, "loss": 1.5949, "num_input_tokens_seen": 82272, "step": 435 }, { "epoch": 0.2286902286902287, "grad_norm": 4.587823867797852, "learning_rate": 5.704261954261955e-06, "loss": 1.4827, "num_input_tokens_seen": 83264, "step": 440 }, { "epoch": 0.2312889812889813, "grad_norm": 5.214271068572998, "learning_rate": 5.76923076923077e-06, "loss": 1.5216, "num_input_tokens_seen": 84224, "step": 445 }, { "epoch": 0.2338877338877339, "grad_norm": 4.847599029541016, "learning_rate": 5.834199584199585e-06, "loss": 1.6149, "num_input_tokens_seen": 85152, "step": 450 }, { "epoch": 0.23648648648648649, "grad_norm": 4.1367573738098145, "learning_rate": 5.8991683991684e-06, "loss": 1.5695, "num_input_tokens_seen": 86080, "step": 455 }, { "epoch": 0.2390852390852391, "grad_norm": 5.126263618469238, "learning_rate": 5.964137214137214e-06, "loss": 1.1811, "num_input_tokens_seen": 87040, "step": 460 }, { "epoch": 0.24168399168399168, "grad_norm": 4.153438091278076, "learning_rate": 6.02910602910603e-06, "loss": 1.3844, "num_input_tokens_seen": 87968, "step": 465 }, { "epoch": 0.2442827442827443, "grad_norm": 4.388014793395996, "learning_rate": 6.094074844074844e-06, "loss": 1.2354, "num_input_tokens_seen": 88992, "step": 470 }, { "epoch": 0.24688149688149688, "grad_norm": 6.016684532165527, "learning_rate": 6.159043659043659e-06, "loss": 1.7011, "num_input_tokens_seen": 89984, "step": 475 }, { "epoch": 0.2494802494802495, "grad_norm": 4.685962677001953, "learning_rate": 6.2240124740124744e-06, "loss": 1.3419, "num_input_tokens_seen": 91008, "step": 480 }, { "epoch": 0.2520790020790021, "grad_norm": 4.750699996948242, "learning_rate": 6.2889812889812895e-06, "loss": 1.665, "num_input_tokens_seen": 92000, "step": 485 }, { "epoch": 0.25467775467775466, "grad_norm": 5.93297004699707, "learning_rate": 6.353950103950105e-06, "loss": 1.3007, "num_input_tokens_seen": 92928, "step": 490 }, { "epoch": 0.25727650727650725, "grad_norm": 4.9737396240234375, "learning_rate": 6.418918918918919e-06, "loss": 1.9481, "num_input_tokens_seen": 93952, "step": 495 }, { "epoch": 0.2598752598752599, "grad_norm": 4.48522424697876, "learning_rate": 6.483887733887734e-06, "loss": 1.4173, "num_input_tokens_seen": 94880, "step": 500 }, { "epoch": 0.2624740124740125, "grad_norm": 4.169576168060303, "learning_rate": 6.54885654885655e-06, "loss": 1.4621, "num_input_tokens_seen": 95744, "step": 505 }, { "epoch": 0.26507276507276506, "grad_norm": 4.166291236877441, "learning_rate": 6.613825363825364e-06, "loss": 1.1065, "num_input_tokens_seen": 96704, "step": 510 }, { "epoch": 0.26767151767151764, "grad_norm": 5.386135101318359, "learning_rate": 6.678794178794179e-06, "loss": 1.057, "num_input_tokens_seen": 97728, "step": 515 }, { "epoch": 0.2702702702702703, "grad_norm": 4.488414764404297, "learning_rate": 6.743762993762993e-06, "loss": 1.5113, "num_input_tokens_seen": 98688, "step": 520 }, { "epoch": 0.27286902286902287, "grad_norm": 4.5046210289001465, "learning_rate": 6.808731808731809e-06, "loss": 1.6482, "num_input_tokens_seen": 99648, "step": 525 }, { "epoch": 0.27546777546777546, "grad_norm": 5.109572410583496, "learning_rate": 6.873700623700624e-06, "loss": 1.7921, "num_input_tokens_seen": 100544, "step": 530 }, { "epoch": 0.27806652806652804, "grad_norm": 4.249133586883545, "learning_rate": 6.9386694386694386e-06, "loss": 1.4071, "num_input_tokens_seen": 101472, "step": 535 }, { "epoch": 0.2806652806652807, "grad_norm": 4.243518829345703, "learning_rate": 7.003638253638254e-06, "loss": 1.3702, "num_input_tokens_seen": 102400, "step": 540 }, { "epoch": 0.28326403326403327, "grad_norm": 4.779784679412842, "learning_rate": 7.0686070686070696e-06, "loss": 1.1136, "num_input_tokens_seen": 103424, "step": 545 }, { "epoch": 0.28586278586278585, "grad_norm": 4.956280708312988, "learning_rate": 7.133575883575884e-06, "loss": 1.3721, "num_input_tokens_seen": 104416, "step": 550 }, { "epoch": 0.28846153846153844, "grad_norm": 4.695742607116699, "learning_rate": 7.198544698544699e-06, "loss": 1.0324, "num_input_tokens_seen": 105312, "step": 555 }, { "epoch": 0.2910602910602911, "grad_norm": 5.507964611053467, "learning_rate": 7.263513513513513e-06, "loss": 1.3428, "num_input_tokens_seen": 106304, "step": 560 }, { "epoch": 0.29365904365904366, "grad_norm": 4.216185569763184, "learning_rate": 7.328482328482329e-06, "loss": 1.2205, "num_input_tokens_seen": 107200, "step": 565 }, { "epoch": 0.29625779625779625, "grad_norm": 4.232551574707031, "learning_rate": 7.393451143451144e-06, "loss": 1.4006, "num_input_tokens_seen": 108160, "step": 570 }, { "epoch": 0.29885654885654883, "grad_norm": 4.907680988311768, "learning_rate": 7.458419958419958e-06, "loss": 1.2645, "num_input_tokens_seen": 109120, "step": 575 }, { "epoch": 0.30145530145530147, "grad_norm": 4.057405471801758, "learning_rate": 7.523388773388773e-06, "loss": 1.1994, "num_input_tokens_seen": 110112, "step": 580 }, { "epoch": 0.30405405405405406, "grad_norm": 4.081585884094238, "learning_rate": 7.588357588357589e-06, "loss": 1.1014, "num_input_tokens_seen": 111072, "step": 585 }, { "epoch": 0.30665280665280664, "grad_norm": 3.5816824436187744, "learning_rate": 7.653326403326404e-06, "loss": 1.1502, "num_input_tokens_seen": 112032, "step": 590 }, { "epoch": 0.3092515592515592, "grad_norm": 3.243480682373047, "learning_rate": 7.718295218295219e-06, "loss": 0.6955, "num_input_tokens_seen": 112992, "step": 595 }, { "epoch": 0.31185031185031187, "grad_norm": 3.6461236476898193, "learning_rate": 7.783264033264033e-06, "loss": 0.8044, "num_input_tokens_seen": 113984, "step": 600 }, { "epoch": 0.31444906444906445, "grad_norm": 4.157686710357666, "learning_rate": 7.848232848232849e-06, "loss": 1.0427, "num_input_tokens_seen": 114976, "step": 605 }, { "epoch": 0.31704781704781704, "grad_norm": 2.9544625282287598, "learning_rate": 7.913201663201663e-06, "loss": 0.9465, "num_input_tokens_seen": 116096, "step": 610 }, { "epoch": 0.3196465696465696, "grad_norm": 2.8394696712493896, "learning_rate": 7.978170478170479e-06, "loss": 0.6735, "num_input_tokens_seen": 117024, "step": 615 }, { "epoch": 0.32224532224532226, "grad_norm": 2.17812180519104, "learning_rate": 8.043139293139293e-06, "loss": 0.6211, "num_input_tokens_seen": 117984, "step": 620 }, { "epoch": 0.32484407484407485, "grad_norm": 4.285351753234863, "learning_rate": 8.108108108108109e-06, "loss": 0.88, "num_input_tokens_seen": 118912, "step": 625 }, { "epoch": 0.32744282744282743, "grad_norm": 3.1406471729278564, "learning_rate": 8.173076923076923e-06, "loss": 0.5626, "num_input_tokens_seen": 119840, "step": 630 }, { "epoch": 0.33004158004158, "grad_norm": 2.646674871444702, "learning_rate": 8.238045738045738e-06, "loss": 0.8788, "num_input_tokens_seen": 120768, "step": 635 }, { "epoch": 0.33264033264033266, "grad_norm": 4.336171627044678, "learning_rate": 8.303014553014553e-06, "loss": 1.0893, "num_input_tokens_seen": 121824, "step": 640 }, { "epoch": 0.33523908523908524, "grad_norm": 1.8497097492218018, "learning_rate": 8.36798336798337e-06, "loss": 0.3269, "num_input_tokens_seen": 122720, "step": 645 }, { "epoch": 0.33783783783783783, "grad_norm": 1.2917816638946533, "learning_rate": 8.432952182952184e-06, "loss": 0.7756, "num_input_tokens_seen": 123648, "step": 650 }, { "epoch": 0.3404365904365904, "grad_norm": 1.4513083696365356, "learning_rate": 8.497920997920998e-06, "loss": 0.4023, "num_input_tokens_seen": 124640, "step": 655 }, { "epoch": 0.34303534303534305, "grad_norm": 2.126418113708496, "learning_rate": 8.562889812889812e-06, "loss": 1.0679, "num_input_tokens_seen": 125600, "step": 660 }, { "epoch": 0.34563409563409564, "grad_norm": 1.2936424016952515, "learning_rate": 8.627858627858628e-06, "loss": 0.5039, "num_input_tokens_seen": 126528, "step": 665 }, { "epoch": 0.3482328482328482, "grad_norm": 2.5757479667663574, "learning_rate": 8.692827442827444e-06, "loss": 0.6153, "num_input_tokens_seen": 127520, "step": 670 }, { "epoch": 0.3508316008316008, "grad_norm": 0.8505427241325378, "learning_rate": 8.757796257796258e-06, "loss": 0.3976, "num_input_tokens_seen": 128416, "step": 675 }, { "epoch": 0.35343035343035345, "grad_norm": 1.2440232038497925, "learning_rate": 8.822765072765072e-06, "loss": 0.271, "num_input_tokens_seen": 129440, "step": 680 }, { "epoch": 0.35602910602910603, "grad_norm": 3.4699935913085938, "learning_rate": 8.887733887733888e-06, "loss": 0.9601, "num_input_tokens_seen": 130432, "step": 685 }, { "epoch": 0.3586278586278586, "grad_norm": 2.076303005218506, "learning_rate": 8.952702702702702e-06, "loss": 0.6837, "num_input_tokens_seen": 131392, "step": 690 }, { "epoch": 0.3612266112266112, "grad_norm": 1.4844729900360107, "learning_rate": 9.017671517671518e-06, "loss": 0.4798, "num_input_tokens_seen": 132320, "step": 695 }, { "epoch": 0.36382536382536385, "grad_norm": 2.340196132659912, "learning_rate": 9.082640332640333e-06, "loss": 0.5671, "num_input_tokens_seen": 133152, "step": 700 }, { "epoch": 0.36642411642411643, "grad_norm": 2.232010841369629, "learning_rate": 9.147609147609149e-06, "loss": 0.3383, "num_input_tokens_seen": 134144, "step": 705 }, { "epoch": 0.369022869022869, "grad_norm": 2.9583256244659424, "learning_rate": 9.212577962577963e-06, "loss": 0.6047, "num_input_tokens_seen": 135008, "step": 710 }, { "epoch": 0.3716216216216216, "grad_norm": 3.2734920978546143, "learning_rate": 9.277546777546777e-06, "loss": 0.6505, "num_input_tokens_seen": 135936, "step": 715 }, { "epoch": 0.37422037422037424, "grad_norm": 3.8538808822631836, "learning_rate": 9.342515592515593e-06, "loss": 0.6104, "num_input_tokens_seen": 136928, "step": 720 }, { "epoch": 0.3768191268191268, "grad_norm": 1.0706907510757446, "learning_rate": 9.407484407484409e-06, "loss": 0.2644, "num_input_tokens_seen": 137888, "step": 725 }, { "epoch": 0.3794178794178794, "grad_norm": 4.750820159912109, "learning_rate": 9.472453222453223e-06, "loss": 0.6761, "num_input_tokens_seen": 138880, "step": 730 }, { "epoch": 0.382016632016632, "grad_norm": 0.8215144276618958, "learning_rate": 9.537422037422037e-06, "loss": 0.5843, "num_input_tokens_seen": 139872, "step": 735 }, { "epoch": 0.38461538461538464, "grad_norm": 1.9783040285110474, "learning_rate": 9.602390852390852e-06, "loss": 0.2, "num_input_tokens_seen": 140768, "step": 740 }, { "epoch": 0.3872141372141372, "grad_norm": 3.3352670669555664, "learning_rate": 9.667359667359669e-06, "loss": 1.0062, "num_input_tokens_seen": 141760, "step": 745 }, { "epoch": 0.3898128898128898, "grad_norm": 0.19533170759677887, "learning_rate": 9.732328482328483e-06, "loss": 0.3925, "num_input_tokens_seen": 142688, "step": 750 }, { "epoch": 0.3924116424116424, "grad_norm": 3.272860527038574, "learning_rate": 9.797297297297298e-06, "loss": 0.216, "num_input_tokens_seen": 143680, "step": 755 }, { "epoch": 0.39501039501039503, "grad_norm": 2.8174352645874023, "learning_rate": 9.862266112266112e-06, "loss": 0.5414, "num_input_tokens_seen": 144672, "step": 760 }, { "epoch": 0.3976091476091476, "grad_norm": 3.735753059387207, "learning_rate": 9.927234927234928e-06, "loss": 0.2889, "num_input_tokens_seen": 145536, "step": 765 }, { "epoch": 0.4002079002079002, "grad_norm": 0.3371600806713104, "learning_rate": 9.992203742203744e-06, "loss": 0.4749, "num_input_tokens_seen": 146464, "step": 770 }, { "epoch": 0.4028066528066528, "grad_norm": 2.4817726612091064, "learning_rate": 1.0057172557172558e-05, "loss": 0.4759, "num_input_tokens_seen": 147520, "step": 775 }, { "epoch": 0.40540540540540543, "grad_norm": 3.5351004600524902, "learning_rate": 1.0122141372141372e-05, "loss": 0.4036, "num_input_tokens_seen": 148480, "step": 780 }, { "epoch": 0.408004158004158, "grad_norm": 1.82395601272583, "learning_rate": 1.0187110187110188e-05, "loss": 0.1976, "num_input_tokens_seen": 149440, "step": 785 }, { "epoch": 0.4106029106029106, "grad_norm": 0.4408630430698395, "learning_rate": 1.0252079002079002e-05, "loss": 0.4456, "num_input_tokens_seen": 150336, "step": 790 }, { "epoch": 0.4132016632016632, "grad_norm": 0.3262403607368469, "learning_rate": 1.0317047817047818e-05, "loss": 0.1852, "num_input_tokens_seen": 151232, "step": 795 }, { "epoch": 0.4158004158004158, "grad_norm": 1.8000391721725464, "learning_rate": 1.0382016632016632e-05, "loss": 0.4057, "num_input_tokens_seen": 152160, "step": 800 }, { "epoch": 0.4183991683991684, "grad_norm": 1.025941252708435, "learning_rate": 1.0446985446985448e-05, "loss": 0.314, "num_input_tokens_seen": 153120, "step": 805 }, { "epoch": 0.420997920997921, "grad_norm": 2.9086601734161377, "learning_rate": 1.0511954261954263e-05, "loss": 0.4367, "num_input_tokens_seen": 154112, "step": 810 }, { "epoch": 0.4235966735966736, "grad_norm": 1.5982004404067993, "learning_rate": 1.0576923076923077e-05, "loss": 0.453, "num_input_tokens_seen": 155008, "step": 815 }, { "epoch": 0.4261954261954262, "grad_norm": 0.7515773773193359, "learning_rate": 1.0641891891891893e-05, "loss": 0.1357, "num_input_tokens_seen": 156000, "step": 820 }, { "epoch": 0.4287941787941788, "grad_norm": 3.160454511642456, "learning_rate": 1.0706860706860709e-05, "loss": 0.4098, "num_input_tokens_seen": 156928, "step": 825 }, { "epoch": 0.4313929313929314, "grad_norm": 2.9594950675964355, "learning_rate": 1.0771829521829523e-05, "loss": 0.4344, "num_input_tokens_seen": 157888, "step": 830 }, { "epoch": 0.433991683991684, "grad_norm": 1.356125831604004, "learning_rate": 1.0836798336798337e-05, "loss": 0.4487, "num_input_tokens_seen": 158848, "step": 835 }, { "epoch": 0.4365904365904366, "grad_norm": 1.5019943714141846, "learning_rate": 1.0901767151767151e-05, "loss": 0.4532, "num_input_tokens_seen": 159776, "step": 840 }, { "epoch": 0.4391891891891892, "grad_norm": 0.2829939126968384, "learning_rate": 1.0966735966735967e-05, "loss": 0.4106, "num_input_tokens_seen": 160800, "step": 845 }, { "epoch": 0.4417879417879418, "grad_norm": 0.11285495012998581, "learning_rate": 1.1031704781704783e-05, "loss": 0.4015, "num_input_tokens_seen": 161792, "step": 850 }, { "epoch": 0.44438669438669437, "grad_norm": 2.022071361541748, "learning_rate": 1.1096673596673597e-05, "loss": 0.2682, "num_input_tokens_seen": 162720, "step": 855 }, { "epoch": 0.446985446985447, "grad_norm": 2.0564653873443604, "learning_rate": 1.1161642411642412e-05, "loss": 0.5684, "num_input_tokens_seen": 163680, "step": 860 }, { "epoch": 0.4495841995841996, "grad_norm": 2.9112672805786133, "learning_rate": 1.1226611226611228e-05, "loss": 0.5395, "num_input_tokens_seen": 164672, "step": 865 }, { "epoch": 0.4521829521829522, "grad_norm": 0.26752808690071106, "learning_rate": 1.1291580041580042e-05, "loss": 0.2689, "num_input_tokens_seen": 165568, "step": 870 }, { "epoch": 0.45478170478170477, "grad_norm": 1.3288511037826538, "learning_rate": 1.1356548856548858e-05, "loss": 0.3476, "num_input_tokens_seen": 166496, "step": 875 }, { "epoch": 0.4573804573804574, "grad_norm": 0.44646117091178894, "learning_rate": 1.1421517671517672e-05, "loss": 0.4811, "num_input_tokens_seen": 167456, "step": 880 }, { "epoch": 0.45997920997921, "grad_norm": 0.34344568848609924, "learning_rate": 1.1486486486486488e-05, "loss": 0.343, "num_input_tokens_seen": 168416, "step": 885 }, { "epoch": 0.4625779625779626, "grad_norm": 1.3743458986282349, "learning_rate": 1.1551455301455302e-05, "loss": 0.3567, "num_input_tokens_seen": 169312, "step": 890 }, { "epoch": 0.46517671517671516, "grad_norm": 1.8233109712600708, "learning_rate": 1.1616424116424116e-05, "loss": 0.4487, "num_input_tokens_seen": 170272, "step": 895 }, { "epoch": 0.4677754677754678, "grad_norm": 2.1519787311553955, "learning_rate": 1.1681392931392932e-05, "loss": 0.3058, "num_input_tokens_seen": 171136, "step": 900 }, { "epoch": 0.4703742203742204, "grad_norm": 0.8772083520889282, "learning_rate": 1.1746361746361748e-05, "loss": 0.1231, "num_input_tokens_seen": 172032, "step": 905 }, { "epoch": 0.47297297297297297, "grad_norm": 1.8867658376693726, "learning_rate": 1.1811330561330562e-05, "loss": 0.4516, "num_input_tokens_seen": 172992, "step": 910 }, { "epoch": 0.47557172557172556, "grad_norm": 1.3940728902816772, "learning_rate": 1.1876299376299377e-05, "loss": 0.4094, "num_input_tokens_seen": 173952, "step": 915 }, { "epoch": 0.4781704781704782, "grad_norm": 2.1370553970336914, "learning_rate": 1.194126819126819e-05, "loss": 0.4099, "num_input_tokens_seen": 174848, "step": 920 }, { "epoch": 0.4807692307692308, "grad_norm": 1.6183445453643799, "learning_rate": 1.2006237006237007e-05, "loss": 0.4286, "num_input_tokens_seen": 175776, "step": 925 }, { "epoch": 0.48336798336798337, "grad_norm": 0.1200045496225357, "learning_rate": 1.2071205821205823e-05, "loss": 0.3136, "num_input_tokens_seen": 176864, "step": 930 }, { "epoch": 0.48596673596673595, "grad_norm": 0.23789368569850922, "learning_rate": 1.2136174636174637e-05, "loss": 0.3767, "num_input_tokens_seen": 177856, "step": 935 }, { "epoch": 0.4885654885654886, "grad_norm": 2.7908105850219727, "learning_rate": 1.2201143451143451e-05, "loss": 0.3064, "num_input_tokens_seen": 178848, "step": 940 }, { "epoch": 0.4911642411642412, "grad_norm": 3.4652607440948486, "learning_rate": 1.2266112266112267e-05, "loss": 0.5884, "num_input_tokens_seen": 179872, "step": 945 }, { "epoch": 0.49376299376299376, "grad_norm": 0.9001139402389526, "learning_rate": 1.2331081081081081e-05, "loss": 0.4918, "num_input_tokens_seen": 180704, "step": 950 }, { "epoch": 0.49636174636174635, "grad_norm": 1.5649081468582153, "learning_rate": 1.2396049896049897e-05, "loss": 0.1717, "num_input_tokens_seen": 181568, "step": 955 }, { "epoch": 0.498960498960499, "grad_norm": 3.5749831199645996, "learning_rate": 1.2461018711018711e-05, "loss": 0.3197, "num_input_tokens_seen": 182592, "step": 960 }, { "epoch": 0.5015592515592515, "grad_norm": 2.761462688446045, "learning_rate": 1.2525987525987526e-05, "loss": 0.2191, "num_input_tokens_seen": 183616, "step": 965 }, { "epoch": 0.5041580041580042, "grad_norm": 0.13608603179454803, "learning_rate": 1.2590956340956342e-05, "loss": 0.2669, "num_input_tokens_seen": 184608, "step": 970 }, { "epoch": 0.5067567567567568, "grad_norm": 0.1911783218383789, "learning_rate": 1.2655925155925156e-05, "loss": 0.3562, "num_input_tokens_seen": 185536, "step": 975 }, { "epoch": 0.5093555093555093, "grad_norm": 0.9213443398475647, "learning_rate": 1.2720893970893972e-05, "loss": 0.3366, "num_input_tokens_seen": 186560, "step": 980 }, { "epoch": 0.511954261954262, "grad_norm": 3.2217047214508057, "learning_rate": 1.2785862785862788e-05, "loss": 0.4887, "num_input_tokens_seen": 187552, "step": 985 }, { "epoch": 0.5145530145530145, "grad_norm": 3.460385322570801, "learning_rate": 1.28508316008316e-05, "loss": 0.2911, "num_input_tokens_seen": 188576, "step": 990 }, { "epoch": 0.5171517671517671, "grad_norm": 2.6959164142608643, "learning_rate": 1.2915800415800416e-05, "loss": 0.3361, "num_input_tokens_seen": 189472, "step": 995 }, { "epoch": 0.5197505197505198, "grad_norm": 0.06926345080137253, "learning_rate": 1.2980769230769232e-05, "loss": 0.3709, "num_input_tokens_seen": 190368, "step": 1000 }, { "epoch": 0.5223492723492723, "grad_norm": 0.10815850645303726, "learning_rate": 1.3045738045738046e-05, "loss": 0.0203, "num_input_tokens_seen": 191296, "step": 1005 }, { "epoch": 0.524948024948025, "grad_norm": 2.8996741771698, "learning_rate": 1.3110706860706862e-05, "loss": 0.6673, "num_input_tokens_seen": 192192, "step": 1010 }, { "epoch": 0.5275467775467776, "grad_norm": 0.7198574542999268, "learning_rate": 1.3175675675675675e-05, "loss": 0.3606, "num_input_tokens_seen": 193152, "step": 1015 }, { "epoch": 0.5301455301455301, "grad_norm": 5.24624490737915, "learning_rate": 1.324064449064449e-05, "loss": 0.729, "num_input_tokens_seen": 194112, "step": 1020 }, { "epoch": 0.5327442827442828, "grad_norm": 2.219195604324341, "learning_rate": 1.3305613305613307e-05, "loss": 0.2439, "num_input_tokens_seen": 195072, "step": 1025 }, { "epoch": 0.5353430353430353, "grad_norm": 1.2136561870574951, "learning_rate": 1.337058212058212e-05, "loss": 0.2235, "num_input_tokens_seen": 195936, "step": 1030 }, { "epoch": 0.5379417879417879, "grad_norm": 0.23627012968063354, "learning_rate": 1.3435550935550937e-05, "loss": 0.4322, "num_input_tokens_seen": 196864, "step": 1035 }, { "epoch": 0.5405405405405406, "grad_norm": 0.0813111811876297, "learning_rate": 1.3500519750519753e-05, "loss": 0.2937, "num_input_tokens_seen": 197888, "step": 1040 }, { "epoch": 0.5431392931392931, "grad_norm": 1.0023311376571655, "learning_rate": 1.3565488565488565e-05, "loss": 0.1359, "num_input_tokens_seen": 198816, "step": 1045 }, { "epoch": 0.5457380457380457, "grad_norm": 0.24974097311496735, "learning_rate": 1.3630457380457381e-05, "loss": 0.4252, "num_input_tokens_seen": 199744, "step": 1050 }, { "epoch": 0.5483367983367984, "grad_norm": 0.18767845630645752, "learning_rate": 1.3695426195426195e-05, "loss": 0.2602, "num_input_tokens_seen": 200704, "step": 1055 }, { "epoch": 0.5509355509355509, "grad_norm": 0.2750694155693054, "learning_rate": 1.3760395010395011e-05, "loss": 0.115, "num_input_tokens_seen": 201664, "step": 1060 }, { "epoch": 0.5535343035343036, "grad_norm": 0.29694247245788574, "learning_rate": 1.3825363825363827e-05, "loss": 0.4572, "num_input_tokens_seen": 202624, "step": 1065 }, { "epoch": 0.5561330561330561, "grad_norm": 2.461657762527466, "learning_rate": 1.389033264033264e-05, "loss": 0.0809, "num_input_tokens_seen": 203520, "step": 1070 }, { "epoch": 0.5587318087318087, "grad_norm": 3.584895133972168, "learning_rate": 1.3955301455301456e-05, "loss": 0.2359, "num_input_tokens_seen": 204512, "step": 1075 }, { "epoch": 0.5613305613305614, "grad_norm": 1.9665179252624512, "learning_rate": 1.4020270270270271e-05, "loss": 0.2572, "num_input_tokens_seen": 205472, "step": 1080 }, { "epoch": 0.5639293139293139, "grad_norm": 2.27067494392395, "learning_rate": 1.4085239085239086e-05, "loss": 0.4012, "num_input_tokens_seen": 206528, "step": 1085 }, { "epoch": 0.5665280665280665, "grad_norm": 0.06383416056632996, "learning_rate": 1.4150207900207902e-05, "loss": 0.2503, "num_input_tokens_seen": 207424, "step": 1090 }, { "epoch": 0.5691268191268192, "grad_norm": 0.2316725105047226, "learning_rate": 1.4215176715176714e-05, "loss": 0.081, "num_input_tokens_seen": 208416, "step": 1095 }, { "epoch": 0.5717255717255717, "grad_norm": 1.4198354482650757, "learning_rate": 1.428014553014553e-05, "loss": 0.2873, "num_input_tokens_seen": 209408, "step": 1100 }, { "epoch": 0.5743243243243243, "grad_norm": 2.3356399536132812, "learning_rate": 1.4345114345114346e-05, "loss": 0.3881, "num_input_tokens_seen": 210432, "step": 1105 }, { "epoch": 0.5769230769230769, "grad_norm": 1.8762314319610596, "learning_rate": 1.441008316008316e-05, "loss": 0.0864, "num_input_tokens_seen": 211360, "step": 1110 }, { "epoch": 0.5795218295218295, "grad_norm": 3.792170286178589, "learning_rate": 1.4475051975051976e-05, "loss": 0.3045, "num_input_tokens_seen": 212288, "step": 1115 }, { "epoch": 0.5821205821205822, "grad_norm": 1.4448484182357788, "learning_rate": 1.4540020790020792e-05, "loss": 0.33, "num_input_tokens_seen": 213216, "step": 1120 }, { "epoch": 0.5847193347193347, "grad_norm": 2.017183542251587, "learning_rate": 1.4604989604989605e-05, "loss": 0.31, "num_input_tokens_seen": 214208, "step": 1125 }, { "epoch": 0.5873180873180873, "grad_norm": 1.102732539176941, "learning_rate": 1.466995841995842e-05, "loss": 0.3037, "num_input_tokens_seen": 215232, "step": 1130 }, { "epoch": 0.58991683991684, "grad_norm": 0.5515732169151306, "learning_rate": 1.4734927234927235e-05, "loss": 0.3705, "num_input_tokens_seen": 216096, "step": 1135 }, { "epoch": 0.5925155925155925, "grad_norm": 3.4831414222717285, "learning_rate": 1.479989604989605e-05, "loss": 0.2918, "num_input_tokens_seen": 217024, "step": 1140 }, { "epoch": 0.5951143451143451, "grad_norm": 2.153735876083374, "learning_rate": 1.4864864864864867e-05, "loss": 0.1559, "num_input_tokens_seen": 217920, "step": 1145 }, { "epoch": 0.5977130977130977, "grad_norm": 1.898769736289978, "learning_rate": 1.4929833679833679e-05, "loss": 0.5104, "num_input_tokens_seen": 218816, "step": 1150 }, { "epoch": 0.6003118503118503, "grad_norm": 2.3270387649536133, "learning_rate": 1.4994802494802495e-05, "loss": 0.3781, "num_input_tokens_seen": 219776, "step": 1155 }, { "epoch": 0.6029106029106029, "grad_norm": 0.11943051218986511, "learning_rate": 1.5059771309771311e-05, "loss": 0.2502, "num_input_tokens_seen": 220704, "step": 1160 }, { "epoch": 0.6055093555093555, "grad_norm": 1.4951125383377075, "learning_rate": 1.5124740124740125e-05, "loss": 0.2797, "num_input_tokens_seen": 221792, "step": 1165 }, { "epoch": 0.6081081081081081, "grad_norm": 3.1852686405181885, "learning_rate": 1.5189708939708941e-05, "loss": 0.7419, "num_input_tokens_seen": 222784, "step": 1170 }, { "epoch": 0.6107068607068608, "grad_norm": 5.521493911743164, "learning_rate": 1.5254677754677754e-05, "loss": 0.392, "num_input_tokens_seen": 223744, "step": 1175 }, { "epoch": 0.6133056133056133, "grad_norm": 1.6604697704315186, "learning_rate": 1.531964656964657e-05, "loss": 0.1593, "num_input_tokens_seen": 224736, "step": 1180 }, { "epoch": 0.6159043659043659, "grad_norm": 0.1664881408214569, "learning_rate": 1.5384615384615387e-05, "loss": 0.2469, "num_input_tokens_seen": 225696, "step": 1185 }, { "epoch": 0.6185031185031185, "grad_norm": 0.13834740221500397, "learning_rate": 1.54495841995842e-05, "loss": 0.3126, "num_input_tokens_seen": 226688, "step": 1190 }, { "epoch": 0.6211018711018711, "grad_norm": 0.0880691185593605, "learning_rate": 1.5514553014553016e-05, "loss": 0.1982, "num_input_tokens_seen": 227680, "step": 1195 }, { "epoch": 0.6237006237006237, "grad_norm": 0.3053607940673828, "learning_rate": 1.557952182952183e-05, "loss": 0.2087, "num_input_tokens_seen": 228576, "step": 1200 }, { "epoch": 0.6262993762993763, "grad_norm": 1.0278663635253906, "learning_rate": 1.5644490644490644e-05, "loss": 0.1441, "num_input_tokens_seen": 229504, "step": 1205 }, { "epoch": 0.6288981288981289, "grad_norm": 0.7063376903533936, "learning_rate": 1.570945945945946e-05, "loss": 0.124, "num_input_tokens_seen": 230464, "step": 1210 }, { "epoch": 0.6314968814968815, "grad_norm": 0.837662935256958, "learning_rate": 1.5774428274428273e-05, "loss": 0.1808, "num_input_tokens_seen": 231456, "step": 1215 }, { "epoch": 0.6340956340956341, "grad_norm": 0.08059851825237274, "learning_rate": 1.583939708939709e-05, "loss": 0.3585, "num_input_tokens_seen": 232320, "step": 1220 }, { "epoch": 0.6366943866943867, "grad_norm": 2.420978307723999, "learning_rate": 1.5904365904365904e-05, "loss": 0.2376, "num_input_tokens_seen": 233184, "step": 1225 }, { "epoch": 0.6392931392931392, "grad_norm": 1.5825105905532837, "learning_rate": 1.596933471933472e-05, "loss": 0.338, "num_input_tokens_seen": 234112, "step": 1230 }, { "epoch": 0.6418918918918919, "grad_norm": 2.2923030853271484, "learning_rate": 1.6034303534303536e-05, "loss": 0.3006, "num_input_tokens_seen": 235040, "step": 1235 }, { "epoch": 0.6444906444906445, "grad_norm": 0.8626063466072083, "learning_rate": 1.6099272349272352e-05, "loss": 0.2039, "num_input_tokens_seen": 236032, "step": 1240 }, { "epoch": 0.6470893970893971, "grad_norm": 1.056456446647644, "learning_rate": 1.6164241164241165e-05, "loss": 0.2783, "num_input_tokens_seen": 236992, "step": 1245 }, { "epoch": 0.6496881496881497, "grad_norm": 0.07889572530984879, "learning_rate": 1.622920997920998e-05, "loss": 0.0536, "num_input_tokens_seen": 238016, "step": 1250 }, { "epoch": 0.6522869022869023, "grad_norm": 1.508341670036316, "learning_rate": 1.6294178794178793e-05, "loss": 0.3023, "num_input_tokens_seen": 238944, "step": 1255 }, { "epoch": 0.6548856548856549, "grad_norm": 0.11475363373756409, "learning_rate": 1.635914760914761e-05, "loss": 0.298, "num_input_tokens_seen": 239904, "step": 1260 }, { "epoch": 0.6574844074844075, "grad_norm": 0.17680664360523224, "learning_rate": 1.6424116424116425e-05, "loss": 0.2526, "num_input_tokens_seen": 240832, "step": 1265 }, { "epoch": 0.66008316008316, "grad_norm": 0.6226297616958618, "learning_rate": 1.6489085239085238e-05, "loss": 0.2256, "num_input_tokens_seen": 241824, "step": 1270 }, { "epoch": 0.6626819126819127, "grad_norm": 1.2153046131134033, "learning_rate": 1.6554054054054053e-05, "loss": 0.485, "num_input_tokens_seen": 242816, "step": 1275 }, { "epoch": 0.6652806652806653, "grad_norm": 2.2635958194732666, "learning_rate": 1.6619022869022873e-05, "loss": 0.3629, "num_input_tokens_seen": 243808, "step": 1280 }, { "epoch": 0.6678794178794178, "grad_norm": 0.13079050183296204, "learning_rate": 1.6683991683991685e-05, "loss": 0.3958, "num_input_tokens_seen": 244736, "step": 1285 }, { "epoch": 0.6704781704781705, "grad_norm": 0.5446637868881226, "learning_rate": 1.67489604989605e-05, "loss": 0.0782, "num_input_tokens_seen": 245696, "step": 1290 }, { "epoch": 0.6730769230769231, "grad_norm": 1.5366498231887817, "learning_rate": 1.6813929313929314e-05, "loss": 0.4312, "num_input_tokens_seen": 246720, "step": 1295 }, { "epoch": 0.6756756756756757, "grad_norm": 0.9966176748275757, "learning_rate": 1.687889812889813e-05, "loss": 0.2357, "num_input_tokens_seen": 247744, "step": 1300 }, { "epoch": 0.6782744282744283, "grad_norm": 0.4096314013004303, "learning_rate": 1.6943866943866946e-05, "loss": 0.395, "num_input_tokens_seen": 248736, "step": 1305 }, { "epoch": 0.6808731808731808, "grad_norm": 0.9860207438468933, "learning_rate": 1.7008835758835758e-05, "loss": 0.3376, "num_input_tokens_seen": 249728, "step": 1310 }, { "epoch": 0.6834719334719335, "grad_norm": 3.6733453273773193, "learning_rate": 1.7073804573804574e-05, "loss": 0.4266, "num_input_tokens_seen": 250624, "step": 1315 }, { "epoch": 0.6860706860706861, "grad_norm": 0.7034622430801392, "learning_rate": 1.713877338877339e-05, "loss": 0.1745, "num_input_tokens_seen": 251616, "step": 1320 }, { "epoch": 0.6886694386694386, "grad_norm": 0.6366632580757141, "learning_rate": 1.7203742203742202e-05, "loss": 0.1987, "num_input_tokens_seen": 252480, "step": 1325 }, { "epoch": 0.6912681912681913, "grad_norm": 1.406096339225769, "learning_rate": 1.7268711018711022e-05, "loss": 0.2202, "num_input_tokens_seen": 253440, "step": 1330 }, { "epoch": 0.6938669438669439, "grad_norm": 2.0985891819000244, "learning_rate": 1.7333679833679834e-05, "loss": 0.5486, "num_input_tokens_seen": 254400, "step": 1335 }, { "epoch": 0.6964656964656964, "grad_norm": 1.154427409172058, "learning_rate": 1.739864864864865e-05, "loss": 0.1507, "num_input_tokens_seen": 255424, "step": 1340 }, { "epoch": 0.6990644490644491, "grad_norm": 1.4545764923095703, "learning_rate": 1.7463617463617466e-05, "loss": 0.1084, "num_input_tokens_seen": 256320, "step": 1345 }, { "epoch": 0.7016632016632016, "grad_norm": 0.584995448589325, "learning_rate": 1.752858627858628e-05, "loss": 0.3139, "num_input_tokens_seen": 257216, "step": 1350 }, { "epoch": 0.7042619542619543, "grad_norm": 0.12097394466400146, "learning_rate": 1.7593555093555095e-05, "loss": 0.6216, "num_input_tokens_seen": 258080, "step": 1355 }, { "epoch": 0.7068607068607069, "grad_norm": 3.6830689907073975, "learning_rate": 1.765852390852391e-05, "loss": 0.3345, "num_input_tokens_seen": 259008, "step": 1360 }, { "epoch": 0.7094594594594594, "grad_norm": 0.7442472577095032, "learning_rate": 1.7723492723492723e-05, "loss": 0.1141, "num_input_tokens_seen": 259968, "step": 1365 }, { "epoch": 0.7120582120582121, "grad_norm": 0.9537001848220825, "learning_rate": 1.778846153846154e-05, "loss": 0.1835, "num_input_tokens_seen": 260960, "step": 1370 }, { "epoch": 0.7146569646569647, "grad_norm": 2.4494588375091553, "learning_rate": 1.785343035343035e-05, "loss": 0.5223, "num_input_tokens_seen": 261920, "step": 1375 }, { "epoch": 0.7172557172557172, "grad_norm": 1.224702000617981, "learning_rate": 1.791839916839917e-05, "loss": 0.1814, "num_input_tokens_seen": 262816, "step": 1380 }, { "epoch": 0.7198544698544699, "grad_norm": 0.07958496361970901, "learning_rate": 1.7983367983367987e-05, "loss": 0.5034, "num_input_tokens_seen": 263808, "step": 1385 }, { "epoch": 0.7224532224532224, "grad_norm": 0.5503033399581909, "learning_rate": 1.80483367983368e-05, "loss": 0.1949, "num_input_tokens_seen": 264736, "step": 1390 }, { "epoch": 0.725051975051975, "grad_norm": 0.2517986297607422, "learning_rate": 1.8113305613305615e-05, "loss": 0.2221, "num_input_tokens_seen": 265696, "step": 1395 }, { "epoch": 0.7276507276507277, "grad_norm": 0.9318121075630188, "learning_rate": 1.817827442827443e-05, "loss": 0.4934, "num_input_tokens_seen": 266624, "step": 1400 }, { "epoch": 0.7302494802494802, "grad_norm": 2.75075101852417, "learning_rate": 1.8243243243243244e-05, "loss": 0.4552, "num_input_tokens_seen": 267584, "step": 1405 }, { "epoch": 0.7328482328482329, "grad_norm": 1.6614480018615723, "learning_rate": 1.830821205821206e-05, "loss": 0.2776, "num_input_tokens_seen": 268544, "step": 1410 }, { "epoch": 0.7354469854469855, "grad_norm": 0.09096039831638336, "learning_rate": 1.8373180873180872e-05, "loss": 0.1937, "num_input_tokens_seen": 269472, "step": 1415 }, { "epoch": 0.738045738045738, "grad_norm": 2.975266218185425, "learning_rate": 1.8438149688149688e-05, "loss": 0.222, "num_input_tokens_seen": 270368, "step": 1420 }, { "epoch": 0.7406444906444907, "grad_norm": 3.1170895099639893, "learning_rate": 1.8503118503118504e-05, "loss": 0.3261, "num_input_tokens_seen": 271392, "step": 1425 }, { "epoch": 0.7432432432432432, "grad_norm": 0.21539387106895447, "learning_rate": 1.856808731808732e-05, "loss": 0.1954, "num_input_tokens_seen": 272384, "step": 1430 }, { "epoch": 0.7458419958419958, "grad_norm": 0.7817115783691406, "learning_rate": 1.8633056133056136e-05, "loss": 0.1465, "num_input_tokens_seen": 273280, "step": 1435 }, { "epoch": 0.7484407484407485, "grad_norm": 0.20777982473373413, "learning_rate": 1.8698024948024952e-05, "loss": 0.0706, "num_input_tokens_seen": 274176, "step": 1440 }, { "epoch": 0.751039501039501, "grad_norm": 0.14822502434253693, "learning_rate": 1.8762993762993764e-05, "loss": 0.1589, "num_input_tokens_seen": 275136, "step": 1445 }, { "epoch": 0.7536382536382537, "grad_norm": 0.8027420043945312, "learning_rate": 1.882796257796258e-05, "loss": 0.3379, "num_input_tokens_seen": 276128, "step": 1450 }, { "epoch": 0.7562370062370062, "grad_norm": 0.9055303335189819, "learning_rate": 1.8892931392931393e-05, "loss": 0.2128, "num_input_tokens_seen": 277024, "step": 1455 }, { "epoch": 0.7588357588357588, "grad_norm": 0.7533237338066101, "learning_rate": 1.895790020790021e-05, "loss": 0.253, "num_input_tokens_seen": 278048, "step": 1460 }, { "epoch": 0.7614345114345115, "grad_norm": 0.7418221831321716, "learning_rate": 1.9022869022869025e-05, "loss": 0.1789, "num_input_tokens_seen": 279040, "step": 1465 }, { "epoch": 0.764033264033264, "grad_norm": 0.6504396796226501, "learning_rate": 1.9087837837837837e-05, "loss": 0.2861, "num_input_tokens_seen": 280000, "step": 1470 }, { "epoch": 0.7666320166320166, "grad_norm": 0.0661110207438469, "learning_rate": 1.9152806652806653e-05, "loss": 0.0348, "num_input_tokens_seen": 280960, "step": 1475 }, { "epoch": 0.7692307692307693, "grad_norm": 1.0457404851913452, "learning_rate": 1.921777546777547e-05, "loss": 0.2644, "num_input_tokens_seen": 281920, "step": 1480 }, { "epoch": 0.7718295218295218, "grad_norm": 0.08680402487516403, "learning_rate": 1.9282744282744285e-05, "loss": 0.176, "num_input_tokens_seen": 282944, "step": 1485 }, { "epoch": 0.7744282744282744, "grad_norm": 0.2503196597099304, "learning_rate": 1.93477130977131e-05, "loss": 0.2196, "num_input_tokens_seen": 283872, "step": 1490 }, { "epoch": 0.777027027027027, "grad_norm": 2.159553050994873, "learning_rate": 1.9412681912681913e-05, "loss": 0.3736, "num_input_tokens_seen": 284800, "step": 1495 }, { "epoch": 0.7796257796257796, "grad_norm": 2.995479106903076, "learning_rate": 1.947765072765073e-05, "loss": 0.4256, "num_input_tokens_seen": 285664, "step": 1500 }, { "epoch": 0.7822245322245323, "grad_norm": 0.07008275389671326, "learning_rate": 1.9542619542619545e-05, "loss": 0.0563, "num_input_tokens_seen": 286624, "step": 1505 }, { "epoch": 0.7848232848232848, "grad_norm": 0.09709735959768295, "learning_rate": 1.9607588357588358e-05, "loss": 0.1873, "num_input_tokens_seen": 287520, "step": 1510 }, { "epoch": 0.7874220374220374, "grad_norm": 1.0485605001449585, "learning_rate": 1.9672557172557174e-05, "loss": 0.1794, "num_input_tokens_seen": 288512, "step": 1515 }, { "epoch": 0.7900207900207901, "grad_norm": 1.1704856157302856, "learning_rate": 1.9737525987525986e-05, "loss": 0.3425, "num_input_tokens_seen": 289472, "step": 1520 }, { "epoch": 0.7926195426195426, "grad_norm": 0.1950603425502777, "learning_rate": 1.9802494802494802e-05, "loss": 0.1134, "num_input_tokens_seen": 290400, "step": 1525 }, { "epoch": 0.7952182952182952, "grad_norm": 0.7074883580207825, "learning_rate": 1.9867463617463618e-05, "loss": 0.1463, "num_input_tokens_seen": 291360, "step": 1530 }, { "epoch": 0.7978170478170478, "grad_norm": 0.6497138738632202, "learning_rate": 1.9932432432432434e-05, "loss": 0.2242, "num_input_tokens_seen": 292384, "step": 1535 }, { "epoch": 0.8004158004158004, "grad_norm": 1.3157864809036255, "learning_rate": 1.999740124740125e-05, "loss": 0.3079, "num_input_tokens_seen": 293472, "step": 1540 }, { "epoch": 0.803014553014553, "grad_norm": 2.4523379802703857, "learning_rate": 2.0062370062370066e-05, "loss": 0.2144, "num_input_tokens_seen": 294432, "step": 1545 }, { "epoch": 0.8056133056133056, "grad_norm": 1.6181089878082275, "learning_rate": 2.0127338877338878e-05, "loss": 0.4301, "num_input_tokens_seen": 295456, "step": 1550 }, { "epoch": 0.8082120582120582, "grad_norm": 0.10125811398029327, "learning_rate": 2.0192307692307694e-05, "loss": 0.1254, "num_input_tokens_seen": 296448, "step": 1555 }, { "epoch": 0.8108108108108109, "grad_norm": 0.5824875235557556, "learning_rate": 2.0257276507276507e-05, "loss": 0.073, "num_input_tokens_seen": 297376, "step": 1560 }, { "epoch": 0.8134095634095634, "grad_norm": 0.9866721034049988, "learning_rate": 2.0322245322245323e-05, "loss": 0.1712, "num_input_tokens_seen": 298336, "step": 1565 }, { "epoch": 0.816008316008316, "grad_norm": 0.15785899758338928, "learning_rate": 2.038721413721414e-05, "loss": 0.0883, "num_input_tokens_seen": 299264, "step": 1570 }, { "epoch": 0.8186070686070686, "grad_norm": 0.9991189241409302, "learning_rate": 2.045218295218295e-05, "loss": 0.1459, "num_input_tokens_seen": 300256, "step": 1575 }, { "epoch": 0.8212058212058212, "grad_norm": 0.15325213968753815, "learning_rate": 2.0517151767151767e-05, "loss": 0.0455, "num_input_tokens_seen": 301184, "step": 1580 }, { "epoch": 0.8238045738045738, "grad_norm": 0.21533022820949554, "learning_rate": 2.0582120582120583e-05, "loss": 0.1223, "num_input_tokens_seen": 302144, "step": 1585 }, { "epoch": 0.8264033264033264, "grad_norm": 0.11757231503725052, "learning_rate": 2.06470893970894e-05, "loss": 0.2562, "num_input_tokens_seen": 303136, "step": 1590 }, { "epoch": 0.829002079002079, "grad_norm": 2.6264874935150146, "learning_rate": 2.0712058212058215e-05, "loss": 0.2903, "num_input_tokens_seen": 304096, "step": 1595 }, { "epoch": 0.8316008316008316, "grad_norm": 0.13825589418411255, "learning_rate": 2.0777027027027027e-05, "loss": 0.1175, "num_input_tokens_seen": 304992, "step": 1600 }, { "epoch": 0.8341995841995842, "grad_norm": 1.6936854124069214, "learning_rate": 2.0841995841995843e-05, "loss": 0.441, "num_input_tokens_seen": 305984, "step": 1605 }, { "epoch": 0.8367983367983368, "grad_norm": 0.3554334044456482, "learning_rate": 2.090696465696466e-05, "loss": 0.2397, "num_input_tokens_seen": 307008, "step": 1610 }, { "epoch": 0.8393970893970893, "grad_norm": 1.315146565437317, "learning_rate": 2.097193347193347e-05, "loss": 0.1155, "num_input_tokens_seen": 307936, "step": 1615 }, { "epoch": 0.841995841995842, "grad_norm": 0.20094899833202362, "learning_rate": 2.1036902286902288e-05, "loss": 0.1308, "num_input_tokens_seen": 308896, "step": 1620 }, { "epoch": 0.8445945945945946, "grad_norm": 0.3709370195865631, "learning_rate": 2.1101871101871104e-05, "loss": 0.3235, "num_input_tokens_seen": 309824, "step": 1625 }, { "epoch": 0.8471933471933472, "grad_norm": 0.9913859963417053, "learning_rate": 2.1166839916839916e-05, "loss": 0.2889, "num_input_tokens_seen": 310880, "step": 1630 }, { "epoch": 0.8497920997920998, "grad_norm": 1.4265779256820679, "learning_rate": 2.1231808731808732e-05, "loss": 0.37, "num_input_tokens_seen": 311840, "step": 1635 }, { "epoch": 0.8523908523908524, "grad_norm": 1.1239291429519653, "learning_rate": 2.1296777546777548e-05, "loss": 0.093, "num_input_tokens_seen": 312800, "step": 1640 }, { "epoch": 0.854989604989605, "grad_norm": 1.0735615491867065, "learning_rate": 2.1361746361746364e-05, "loss": 0.2817, "num_input_tokens_seen": 313792, "step": 1645 }, { "epoch": 0.8575883575883576, "grad_norm": 0.18603956699371338, "learning_rate": 2.142671517671518e-05, "loss": 0.1116, "num_input_tokens_seen": 314720, "step": 1650 }, { "epoch": 0.8601871101871101, "grad_norm": 0.9223487973213196, "learning_rate": 2.1491683991683992e-05, "loss": 0.1472, "num_input_tokens_seen": 315648, "step": 1655 }, { "epoch": 0.8627858627858628, "grad_norm": 0.19823147356510162, "learning_rate": 2.1556652806652808e-05, "loss": 0.0935, "num_input_tokens_seen": 316608, "step": 1660 }, { "epoch": 0.8653846153846154, "grad_norm": 1.5868207216262817, "learning_rate": 2.1621621621621624e-05, "loss": 0.3108, "num_input_tokens_seen": 317600, "step": 1665 }, { "epoch": 0.867983367983368, "grad_norm": 1.4694221019744873, "learning_rate": 2.1686590436590437e-05, "loss": 0.5784, "num_input_tokens_seen": 318560, "step": 1670 }, { "epoch": 0.8705821205821206, "grad_norm": 1.4232169389724731, "learning_rate": 2.1751559251559253e-05, "loss": 0.1561, "num_input_tokens_seen": 319456, "step": 1675 }, { "epoch": 0.8731808731808732, "grad_norm": 1.3299767971038818, "learning_rate": 2.1816528066528065e-05, "loss": 0.2432, "num_input_tokens_seen": 320384, "step": 1680 }, { "epoch": 0.8757796257796258, "grad_norm": 0.9820790886878967, "learning_rate": 2.188149688149688e-05, "loss": 0.2376, "num_input_tokens_seen": 321344, "step": 1685 }, { "epoch": 0.8783783783783784, "grad_norm": 0.7607834935188293, "learning_rate": 2.1946465696465697e-05, "loss": 0.1183, "num_input_tokens_seen": 322336, "step": 1690 }, { "epoch": 0.8809771309771309, "grad_norm": 0.5367822051048279, "learning_rate": 2.2011434511434513e-05, "loss": 0.1023, "num_input_tokens_seen": 323360, "step": 1695 }, { "epoch": 0.8835758835758836, "grad_norm": 0.15195117890834808, "learning_rate": 2.207640332640333e-05, "loss": 0.1696, "num_input_tokens_seen": 324256, "step": 1700 }, { "epoch": 0.8861746361746362, "grad_norm": 0.4723656177520752, "learning_rate": 2.2141372141372145e-05, "loss": 0.3673, "num_input_tokens_seen": 325152, "step": 1705 }, { "epoch": 0.8887733887733887, "grad_norm": 1.7655649185180664, "learning_rate": 2.2206340956340957e-05, "loss": 0.2393, "num_input_tokens_seen": 326048, "step": 1710 }, { "epoch": 0.8913721413721414, "grad_norm": 0.5686731934547424, "learning_rate": 2.2271309771309773e-05, "loss": 0.1194, "num_input_tokens_seen": 327008, "step": 1715 }, { "epoch": 0.893970893970894, "grad_norm": 0.35533004999160767, "learning_rate": 2.2336278586278586e-05, "loss": 0.083, "num_input_tokens_seen": 327936, "step": 1720 }, { "epoch": 0.8965696465696466, "grad_norm": 0.14377924799919128, "learning_rate": 2.24012474012474e-05, "loss": 0.3519, "num_input_tokens_seen": 328896, "step": 1725 }, { "epoch": 0.8991683991683992, "grad_norm": 1.1883138418197632, "learning_rate": 2.2466216216216218e-05, "loss": 0.1444, "num_input_tokens_seen": 329888, "step": 1730 }, { "epoch": 0.9017671517671517, "grad_norm": 0.5534085631370544, "learning_rate": 2.253118503118503e-05, "loss": 0.1719, "num_input_tokens_seen": 330848, "step": 1735 }, { "epoch": 0.9043659043659044, "grad_norm": 0.15371856093406677, "learning_rate": 2.2596153846153846e-05, "loss": 0.2148, "num_input_tokens_seen": 331744, "step": 1740 }, { "epoch": 0.906964656964657, "grad_norm": 0.5932520627975464, "learning_rate": 2.2661122661122662e-05, "loss": 0.2771, "num_input_tokens_seen": 332736, "step": 1745 }, { "epoch": 0.9095634095634095, "grad_norm": 1.6100196838378906, "learning_rate": 2.2726091476091478e-05, "loss": 0.1562, "num_input_tokens_seen": 333728, "step": 1750 }, { "epoch": 0.9121621621621622, "grad_norm": 0.3626706600189209, "learning_rate": 2.2791060291060294e-05, "loss": 0.0393, "num_input_tokens_seen": 334624, "step": 1755 }, { "epoch": 0.9147609147609148, "grad_norm": 2.063145875930786, "learning_rate": 2.2856029106029106e-05, "loss": 0.1265, "num_input_tokens_seen": 335616, "step": 1760 }, { "epoch": 0.9173596673596673, "grad_norm": 1.3594722747802734, "learning_rate": 2.2920997920997922e-05, "loss": 0.2743, "num_input_tokens_seen": 336544, "step": 1765 }, { "epoch": 0.91995841995842, "grad_norm": 2.8264129161834717, "learning_rate": 2.2985966735966738e-05, "loss": 0.2137, "num_input_tokens_seen": 337472, "step": 1770 }, { "epoch": 0.9225571725571725, "grad_norm": 0.8470602035522461, "learning_rate": 2.305093555093555e-05, "loss": 0.3174, "num_input_tokens_seen": 338400, "step": 1775 }, { "epoch": 0.9251559251559252, "grad_norm": 0.12242826074361801, "learning_rate": 2.3115904365904367e-05, "loss": 0.1148, "num_input_tokens_seen": 339360, "step": 1780 }, { "epoch": 0.9277546777546778, "grad_norm": 0.213448628783226, "learning_rate": 2.3180873180873183e-05, "loss": 0.2128, "num_input_tokens_seen": 340288, "step": 1785 }, { "epoch": 0.9303534303534303, "grad_norm": 1.2412558794021606, "learning_rate": 2.3245841995841995e-05, "loss": 0.2603, "num_input_tokens_seen": 341312, "step": 1790 }, { "epoch": 0.932952182952183, "grad_norm": 1.2546956539154053, "learning_rate": 2.331081081081081e-05, "loss": 0.351, "num_input_tokens_seen": 342272, "step": 1795 }, { "epoch": 0.9355509355509356, "grad_norm": 2.3287665843963623, "learning_rate": 2.3375779625779627e-05, "loss": 0.3326, "num_input_tokens_seen": 343136, "step": 1800 }, { "epoch": 0.9381496881496881, "grad_norm": 1.542664647102356, "learning_rate": 2.3440748440748443e-05, "loss": 0.2573, "num_input_tokens_seen": 344064, "step": 1805 }, { "epoch": 0.9407484407484408, "grad_norm": 1.9704622030258179, "learning_rate": 2.350571725571726e-05, "loss": 0.1404, "num_input_tokens_seen": 345024, "step": 1810 }, { "epoch": 0.9433471933471933, "grad_norm": 0.8796544671058655, "learning_rate": 2.357068607068607e-05, "loss": 0.2378, "num_input_tokens_seen": 346016, "step": 1815 }, { "epoch": 0.9459459459459459, "grad_norm": 0.506488561630249, "learning_rate": 2.3635654885654887e-05, "loss": 0.3027, "num_input_tokens_seen": 347008, "step": 1820 }, { "epoch": 0.9485446985446986, "grad_norm": 1.3513238430023193, "learning_rate": 2.3700623700623703e-05, "loss": 0.1569, "num_input_tokens_seen": 347936, "step": 1825 }, { "epoch": 0.9511434511434511, "grad_norm": 0.21586909890174866, "learning_rate": 2.3765592515592516e-05, "loss": 0.185, "num_input_tokens_seen": 348896, "step": 1830 }, { "epoch": 0.9537422037422038, "grad_norm": 0.4196820557117462, "learning_rate": 2.383056133056133e-05, "loss": 0.2441, "num_input_tokens_seen": 349824, "step": 1835 }, { "epoch": 0.9563409563409564, "grad_norm": 0.9162739515304565, "learning_rate": 2.3895530145530144e-05, "loss": 0.1772, "num_input_tokens_seen": 350752, "step": 1840 }, { "epoch": 0.9589397089397089, "grad_norm": 1.3677270412445068, "learning_rate": 2.396049896049896e-05, "loss": 0.1803, "num_input_tokens_seen": 351648, "step": 1845 }, { "epoch": 0.9615384615384616, "grad_norm": 0.905929684638977, "learning_rate": 2.4025467775467776e-05, "loss": 0.1999, "num_input_tokens_seen": 352608, "step": 1850 }, { "epoch": 0.9641372141372141, "grad_norm": 1.1008516550064087, "learning_rate": 2.4090436590436592e-05, "loss": 0.2177, "num_input_tokens_seen": 353632, "step": 1855 }, { "epoch": 0.9667359667359667, "grad_norm": 0.26981645822525024, "learning_rate": 2.4155405405405408e-05, "loss": 0.0825, "num_input_tokens_seen": 354592, "step": 1860 }, { "epoch": 0.9693347193347194, "grad_norm": 0.6669535636901855, "learning_rate": 2.4220374220374224e-05, "loss": 0.288, "num_input_tokens_seen": 355456, "step": 1865 }, { "epoch": 0.9719334719334719, "grad_norm": 0.18466733396053314, "learning_rate": 2.4285343035343036e-05, "loss": 0.1211, "num_input_tokens_seen": 356448, "step": 1870 }, { "epoch": 0.9745322245322245, "grad_norm": 0.11702802032232285, "learning_rate": 2.4350311850311852e-05, "loss": 0.3246, "num_input_tokens_seen": 357408, "step": 1875 }, { "epoch": 0.9771309771309772, "grad_norm": 0.704026997089386, "learning_rate": 2.4415280665280665e-05, "loss": 0.1601, "num_input_tokens_seen": 358368, "step": 1880 }, { "epoch": 0.9797297297297297, "grad_norm": 0.9781737923622131, "learning_rate": 2.448024948024948e-05, "loss": 0.2995, "num_input_tokens_seen": 359296, "step": 1885 }, { "epoch": 0.9823284823284824, "grad_norm": 0.9245156049728394, "learning_rate": 2.4545218295218297e-05, "loss": 0.1981, "num_input_tokens_seen": 360256, "step": 1890 }, { "epoch": 0.9849272349272349, "grad_norm": 0.7397998571395874, "learning_rate": 2.461018711018711e-05, "loss": 0.107, "num_input_tokens_seen": 361216, "step": 1895 }, { "epoch": 0.9875259875259875, "grad_norm": 0.5420805215835571, "learning_rate": 2.4675155925155925e-05, "loss": 0.2286, "num_input_tokens_seen": 362176, "step": 1900 }, { "epoch": 0.9901247401247402, "grad_norm": 1.8924368619918823, "learning_rate": 2.4740124740124744e-05, "loss": 0.3144, "num_input_tokens_seen": 363200, "step": 1905 }, { "epoch": 0.9927234927234927, "grad_norm": 0.36027291417121887, "learning_rate": 2.4805093555093557e-05, "loss": 0.0685, "num_input_tokens_seen": 364160, "step": 1910 }, { "epoch": 0.9953222453222453, "grad_norm": 0.25841450691223145, "learning_rate": 2.4870062370062373e-05, "loss": 0.2603, "num_input_tokens_seen": 365024, "step": 1915 }, { "epoch": 0.997920997920998, "grad_norm": 1.185859203338623, "learning_rate": 2.4935031185031185e-05, "loss": 0.343, "num_input_tokens_seen": 366016, "step": 1920 }, { "epoch": 1.0, "eval_loss": 0.20758913457393646, "eval_runtime": 8.0186, "eval_samples_per_second": 106.752, "eval_steps_per_second": 26.688, "num_input_tokens_seen": 366712, "step": 1924 }, { "epoch": 1.0005197505197505, "grad_norm": 2.3431880474090576, "learning_rate": 2.5e-05, "loss": 0.3316, "num_input_tokens_seen": 366904, "step": 1925 }, { "epoch": 1.003118503118503, "grad_norm": 0.485740602016449, "learning_rate": 2.5064968814968814e-05, "loss": 0.2137, "num_input_tokens_seen": 367832, "step": 1930 }, { "epoch": 1.0057172557172558, "grad_norm": 0.4665715992450714, "learning_rate": 2.5129937629937633e-05, "loss": 0.1894, "num_input_tokens_seen": 368824, "step": 1935 }, { "epoch": 1.0083160083160083, "grad_norm": 2.3916540145874023, "learning_rate": 2.5194906444906446e-05, "loss": 0.4569, "num_input_tokens_seen": 369816, "step": 1940 }, { "epoch": 1.0109147609147608, "grad_norm": 0.8519537448883057, "learning_rate": 2.5259875259875258e-05, "loss": 0.2239, "num_input_tokens_seen": 370776, "step": 1945 }, { "epoch": 1.0135135135135136, "grad_norm": 1.132051944732666, "learning_rate": 2.5324844074844077e-05, "loss": 0.3816, "num_input_tokens_seen": 371704, "step": 1950 }, { "epoch": 1.0161122661122661, "grad_norm": 0.9918363690376282, "learning_rate": 2.5389812889812893e-05, "loss": 0.1371, "num_input_tokens_seen": 372664, "step": 1955 }, { "epoch": 1.0187110187110187, "grad_norm": 0.09257154166698456, "learning_rate": 2.5454781704781706e-05, "loss": 0.0964, "num_input_tokens_seen": 373592, "step": 1960 }, { "epoch": 1.0213097713097714, "grad_norm": 0.2240888774394989, "learning_rate": 2.551975051975052e-05, "loss": 0.062, "num_input_tokens_seen": 374488, "step": 1965 }, { "epoch": 1.023908523908524, "grad_norm": 1.6504062414169312, "learning_rate": 2.5584719334719338e-05, "loss": 0.2651, "num_input_tokens_seen": 375416, "step": 1970 }, { "epoch": 1.0265072765072765, "grad_norm": 0.9172537326812744, "learning_rate": 2.564968814968815e-05, "loss": 0.2854, "num_input_tokens_seen": 376312, "step": 1975 }, { "epoch": 1.0291060291060292, "grad_norm": 0.4100039303302765, "learning_rate": 2.5714656964656963e-05, "loss": 0.1664, "num_input_tokens_seen": 377176, "step": 1980 }, { "epoch": 1.0317047817047817, "grad_norm": 0.7018710970878601, "learning_rate": 2.5779625779625782e-05, "loss": 0.4459, "num_input_tokens_seen": 378168, "step": 1985 }, { "epoch": 1.0343035343035343, "grad_norm": 0.3468957245349884, "learning_rate": 2.5844594594594595e-05, "loss": 0.1544, "num_input_tokens_seen": 379160, "step": 1990 }, { "epoch": 1.0369022869022868, "grad_norm": 0.4788932800292969, "learning_rate": 2.5909563409563407e-05, "loss": 0.2092, "num_input_tokens_seen": 380120, "step": 1995 }, { "epoch": 1.0395010395010396, "grad_norm": 0.5564035177230835, "learning_rate": 2.5974532224532226e-05, "loss": 0.1268, "num_input_tokens_seen": 381016, "step": 2000 }, { "epoch": 1.042099792099792, "grad_norm": 1.4886488914489746, "learning_rate": 2.6039501039501042e-05, "loss": 0.1887, "num_input_tokens_seen": 381976, "step": 2005 }, { "epoch": 1.0446985446985446, "grad_norm": 1.4620540142059326, "learning_rate": 2.6104469854469855e-05, "loss": 0.1368, "num_input_tokens_seen": 382904, "step": 2010 }, { "epoch": 1.0472972972972974, "grad_norm": 0.8965896964073181, "learning_rate": 2.6169438669438674e-05, "loss": 0.1859, "num_input_tokens_seen": 383800, "step": 2015 }, { "epoch": 1.04989604989605, "grad_norm": 0.43941497802734375, "learning_rate": 2.6234407484407487e-05, "loss": 0.1542, "num_input_tokens_seen": 384824, "step": 2020 }, { "epoch": 1.0524948024948024, "grad_norm": 1.0409797430038452, "learning_rate": 2.62993762993763e-05, "loss": 0.2286, "num_input_tokens_seen": 385880, "step": 2025 }, { "epoch": 1.0550935550935552, "grad_norm": 1.6884667873382568, "learning_rate": 2.636434511434512e-05, "loss": 0.1203, "num_input_tokens_seen": 386808, "step": 2030 }, { "epoch": 1.0576923076923077, "grad_norm": 2.45735239982605, "learning_rate": 2.642931392931393e-05, "loss": 0.2516, "num_input_tokens_seen": 387768, "step": 2035 }, { "epoch": 1.0602910602910602, "grad_norm": 0.8365748524665833, "learning_rate": 2.6494282744282744e-05, "loss": 0.1572, "num_input_tokens_seen": 388728, "step": 2040 }, { "epoch": 1.062889812889813, "grad_norm": 0.6046952605247498, "learning_rate": 2.6559251559251556e-05, "loss": 0.1476, "num_input_tokens_seen": 389688, "step": 2045 }, { "epoch": 1.0654885654885655, "grad_norm": 1.5006012916564941, "learning_rate": 2.6624220374220376e-05, "loss": 0.2174, "num_input_tokens_seen": 390680, "step": 2050 }, { "epoch": 1.068087318087318, "grad_norm": 0.4198724329471588, "learning_rate": 2.668918918918919e-05, "loss": 0.129, "num_input_tokens_seen": 391640, "step": 2055 }, { "epoch": 1.0706860706860706, "grad_norm": 0.6880924701690674, "learning_rate": 2.6754158004158004e-05, "loss": 0.3181, "num_input_tokens_seen": 392568, "step": 2060 }, { "epoch": 1.0732848232848233, "grad_norm": 0.2993570566177368, "learning_rate": 2.6819126819126823e-05, "loss": 0.2575, "num_input_tokens_seen": 393560, "step": 2065 }, { "epoch": 1.0758835758835759, "grad_norm": 0.3575250506401062, "learning_rate": 2.6884095634095636e-05, "loss": 0.1012, "num_input_tokens_seen": 394552, "step": 2070 }, { "epoch": 1.0784823284823284, "grad_norm": 0.5611479878425598, "learning_rate": 2.694906444906445e-05, "loss": 0.1706, "num_input_tokens_seen": 395512, "step": 2075 }, { "epoch": 1.0810810810810811, "grad_norm": 1.9095865488052368, "learning_rate": 2.7014033264033268e-05, "loss": 0.2944, "num_input_tokens_seen": 396504, "step": 2080 }, { "epoch": 1.0836798336798337, "grad_norm": 0.3011113703250885, "learning_rate": 2.707900207900208e-05, "loss": 0.2104, "num_input_tokens_seen": 397496, "step": 2085 }, { "epoch": 1.0862785862785862, "grad_norm": 3.2896175384521484, "learning_rate": 2.7143970893970893e-05, "loss": 0.2691, "num_input_tokens_seen": 398456, "step": 2090 }, { "epoch": 1.088877338877339, "grad_norm": 1.4479539394378662, "learning_rate": 2.7208939708939712e-05, "loss": 0.1482, "num_input_tokens_seen": 399448, "step": 2095 }, { "epoch": 1.0914760914760915, "grad_norm": 1.3972220420837402, "learning_rate": 2.7273908523908525e-05, "loss": 0.155, "num_input_tokens_seen": 400408, "step": 2100 }, { "epoch": 1.094074844074844, "grad_norm": 0.5521454811096191, "learning_rate": 2.733887733887734e-05, "loss": 0.1629, "num_input_tokens_seen": 401336, "step": 2105 }, { "epoch": 1.0966735966735968, "grad_norm": 1.5686215162277222, "learning_rate": 2.7403846153846156e-05, "loss": 0.2835, "num_input_tokens_seen": 402296, "step": 2110 }, { "epoch": 1.0992723492723493, "grad_norm": 0.8757802248001099, "learning_rate": 2.7468814968814972e-05, "loss": 0.1374, "num_input_tokens_seen": 403224, "step": 2115 }, { "epoch": 1.1018711018711018, "grad_norm": 0.2621021866798401, "learning_rate": 2.7533783783783785e-05, "loss": 0.0828, "num_input_tokens_seen": 404184, "step": 2120 }, { "epoch": 1.1044698544698546, "grad_norm": 0.3316408693790436, "learning_rate": 2.7598752598752597e-05, "loss": 0.2792, "num_input_tokens_seen": 405176, "step": 2125 }, { "epoch": 1.107068607068607, "grad_norm": 2.088355541229248, "learning_rate": 2.7663721413721417e-05, "loss": 0.511, "num_input_tokens_seen": 406104, "step": 2130 }, { "epoch": 1.1096673596673596, "grad_norm": 0.5740347504615784, "learning_rate": 2.772869022869023e-05, "loss": 0.1423, "num_input_tokens_seen": 407000, "step": 2135 }, { "epoch": 1.1122661122661124, "grad_norm": 2.010173797607422, "learning_rate": 2.7793659043659042e-05, "loss": 0.2758, "num_input_tokens_seen": 407928, "step": 2140 }, { "epoch": 1.114864864864865, "grad_norm": 0.38649091124534607, "learning_rate": 2.785862785862786e-05, "loss": 0.1177, "num_input_tokens_seen": 408920, "step": 2145 }, { "epoch": 1.1174636174636174, "grad_norm": 0.3549553453922272, "learning_rate": 2.7923596673596674e-05, "loss": 0.0967, "num_input_tokens_seen": 409848, "step": 2150 }, { "epoch": 1.12006237006237, "grad_norm": 1.31662917137146, "learning_rate": 2.798856548856549e-05, "loss": 0.2376, "num_input_tokens_seen": 410840, "step": 2155 }, { "epoch": 1.1226611226611227, "grad_norm": 0.5595337152481079, "learning_rate": 2.8053534303534305e-05, "loss": 0.3632, "num_input_tokens_seen": 411896, "step": 2160 }, { "epoch": 1.1252598752598753, "grad_norm": 0.8313397765159607, "learning_rate": 2.811850311850312e-05, "loss": 0.1466, "num_input_tokens_seen": 412824, "step": 2165 }, { "epoch": 1.1278586278586278, "grad_norm": 1.5682761669158936, "learning_rate": 2.8183471933471934e-05, "loss": 0.2057, "num_input_tokens_seen": 413720, "step": 2170 }, { "epoch": 1.1304573804573805, "grad_norm": 1.2474368810653687, "learning_rate": 2.8248440748440753e-05, "loss": 0.1671, "num_input_tokens_seen": 414648, "step": 2175 }, { "epoch": 1.133056133056133, "grad_norm": 1.1850862503051758, "learning_rate": 2.8313409563409566e-05, "loss": 0.1924, "num_input_tokens_seen": 415640, "step": 2180 }, { "epoch": 1.1356548856548856, "grad_norm": 0.5866402983665466, "learning_rate": 2.8378378378378378e-05, "loss": 0.1293, "num_input_tokens_seen": 416536, "step": 2185 }, { "epoch": 1.1382536382536383, "grad_norm": 1.9956343173980713, "learning_rate": 2.8443347193347198e-05, "loss": 0.3597, "num_input_tokens_seen": 417496, "step": 2190 }, { "epoch": 1.1408523908523909, "grad_norm": 0.6641936898231506, "learning_rate": 2.850831600831601e-05, "loss": 0.2175, "num_input_tokens_seen": 418424, "step": 2195 }, { "epoch": 1.1434511434511434, "grad_norm": 0.5231421589851379, "learning_rate": 2.8573284823284823e-05, "loss": 0.1807, "num_input_tokens_seen": 419384, "step": 2200 }, { "epoch": 1.1460498960498962, "grad_norm": 1.9358230829238892, "learning_rate": 2.863825363825364e-05, "loss": 0.2365, "num_input_tokens_seen": 420312, "step": 2205 }, { "epoch": 1.1486486486486487, "grad_norm": 0.5495506525039673, "learning_rate": 2.8703222453222454e-05, "loss": 0.1634, "num_input_tokens_seen": 421272, "step": 2210 }, { "epoch": 1.1512474012474012, "grad_norm": 0.9518699645996094, "learning_rate": 2.876819126819127e-05, "loss": 0.1931, "num_input_tokens_seen": 422136, "step": 2215 }, { "epoch": 1.1538461538461537, "grad_norm": 1.5423742532730103, "learning_rate": 2.8833160083160083e-05, "loss": 0.386, "num_input_tokens_seen": 423128, "step": 2220 }, { "epoch": 1.1564449064449065, "grad_norm": 1.7811686992645264, "learning_rate": 2.8898128898128902e-05, "loss": 0.2176, "num_input_tokens_seen": 424056, "step": 2225 }, { "epoch": 1.159043659043659, "grad_norm": 0.8137246370315552, "learning_rate": 2.8963097713097715e-05, "loss": 0.1986, "num_input_tokens_seen": 424984, "step": 2230 }, { "epoch": 1.1616424116424116, "grad_norm": 0.40253186225891113, "learning_rate": 2.9028066528066527e-05, "loss": 0.1209, "num_input_tokens_seen": 425848, "step": 2235 }, { "epoch": 1.1642411642411643, "grad_norm": 0.7811002135276794, "learning_rate": 2.9093035343035347e-05, "loss": 0.0959, "num_input_tokens_seen": 426744, "step": 2240 }, { "epoch": 1.1668399168399168, "grad_norm": 0.932898223400116, "learning_rate": 2.915800415800416e-05, "loss": 0.155, "num_input_tokens_seen": 427768, "step": 2245 }, { "epoch": 1.1694386694386694, "grad_norm": 0.34944120049476624, "learning_rate": 2.9222972972972972e-05, "loss": 0.1385, "num_input_tokens_seen": 428728, "step": 2250 }, { "epoch": 1.1720374220374221, "grad_norm": 0.9132587313652039, "learning_rate": 2.928794178794179e-05, "loss": 0.394, "num_input_tokens_seen": 429688, "step": 2255 }, { "epoch": 1.1746361746361746, "grad_norm": 0.23328201472759247, "learning_rate": 2.9352910602910604e-05, "loss": 0.1499, "num_input_tokens_seen": 430584, "step": 2260 }, { "epoch": 1.1772349272349272, "grad_norm": 0.5350027084350586, "learning_rate": 2.941787941787942e-05, "loss": 0.2001, "num_input_tokens_seen": 431480, "step": 2265 }, { "epoch": 1.17983367983368, "grad_norm": 0.9017285108566284, "learning_rate": 2.9482848232848235e-05, "loss": 0.2155, "num_input_tokens_seen": 432376, "step": 2270 }, { "epoch": 1.1824324324324325, "grad_norm": 1.8673570156097412, "learning_rate": 2.954781704781705e-05, "loss": 0.2759, "num_input_tokens_seen": 433368, "step": 2275 }, { "epoch": 1.185031185031185, "grad_norm": 0.2596736252307892, "learning_rate": 2.9612785862785864e-05, "loss": 0.1055, "num_input_tokens_seen": 434232, "step": 2280 }, { "epoch": 1.1876299376299375, "grad_norm": 0.39246857166290283, "learning_rate": 2.9677754677754676e-05, "loss": 0.0767, "num_input_tokens_seen": 435096, "step": 2285 }, { "epoch": 1.1902286902286903, "grad_norm": 0.664448082447052, "learning_rate": 2.9742723492723496e-05, "loss": 0.3065, "num_input_tokens_seen": 436120, "step": 2290 }, { "epoch": 1.1928274428274428, "grad_norm": 1.5335336923599243, "learning_rate": 2.9807692307692308e-05, "loss": 0.2779, "num_input_tokens_seen": 437048, "step": 2295 }, { "epoch": 1.1954261954261955, "grad_norm": 0.22690549492835999, "learning_rate": 2.987266112266112e-05, "loss": 0.1127, "num_input_tokens_seen": 438072, "step": 2300 }, { "epoch": 1.198024948024948, "grad_norm": 0.7117283344268799, "learning_rate": 2.993762993762994e-05, "loss": 0.1757, "num_input_tokens_seen": 439000, "step": 2305 }, { "epoch": 1.2006237006237006, "grad_norm": 0.5913382172584534, "learning_rate": 3.0002598752598753e-05, "loss": 0.1858, "num_input_tokens_seen": 439960, "step": 2310 }, { "epoch": 1.2032224532224531, "grad_norm": 0.2042332887649536, "learning_rate": 3.006756756756757e-05, "loss": 0.134, "num_input_tokens_seen": 440984, "step": 2315 }, { "epoch": 1.2058212058212059, "grad_norm": 0.37872427701950073, "learning_rate": 3.0132536382536384e-05, "loss": 0.0544, "num_input_tokens_seen": 441944, "step": 2320 }, { "epoch": 1.2084199584199584, "grad_norm": 0.28442442417144775, "learning_rate": 3.01975051975052e-05, "loss": 0.1142, "num_input_tokens_seen": 442872, "step": 2325 }, { "epoch": 1.211018711018711, "grad_norm": 0.33879661560058594, "learning_rate": 3.0262474012474013e-05, "loss": 0.2324, "num_input_tokens_seen": 443832, "step": 2330 }, { "epoch": 1.2136174636174637, "grad_norm": 0.41121014952659607, "learning_rate": 3.0327442827442832e-05, "loss": 0.2445, "num_input_tokens_seen": 444856, "step": 2335 }, { "epoch": 1.2162162162162162, "grad_norm": 0.2356598675251007, "learning_rate": 3.0392411642411645e-05, "loss": 0.0883, "num_input_tokens_seen": 445784, "step": 2340 }, { "epoch": 1.2188149688149688, "grad_norm": 1.428971290588379, "learning_rate": 3.0457380457380457e-05, "loss": 0.2415, "num_input_tokens_seen": 446648, "step": 2345 }, { "epoch": 1.2214137214137215, "grad_norm": 0.2180185467004776, "learning_rate": 3.052234927234927e-05, "loss": 0.132, "num_input_tokens_seen": 447576, "step": 2350 }, { "epoch": 1.224012474012474, "grad_norm": 0.1283380091190338, "learning_rate": 3.058731808731809e-05, "loss": 0.2007, "num_input_tokens_seen": 448504, "step": 2355 }, { "epoch": 1.2266112266112266, "grad_norm": 0.7079935073852539, "learning_rate": 3.0652286902286905e-05, "loss": 0.1141, "num_input_tokens_seen": 449400, "step": 2360 }, { "epoch": 1.2292099792099793, "grad_norm": 0.04105569049715996, "learning_rate": 3.071725571725572e-05, "loss": 0.08, "num_input_tokens_seen": 450264, "step": 2365 }, { "epoch": 1.2318087318087318, "grad_norm": 0.10510033369064331, "learning_rate": 3.078222453222454e-05, "loss": 0.1556, "num_input_tokens_seen": 451128, "step": 2370 }, { "epoch": 1.2344074844074844, "grad_norm": 0.575109601020813, "learning_rate": 3.084719334719335e-05, "loss": 0.164, "num_input_tokens_seen": 452088, "step": 2375 }, { "epoch": 1.237006237006237, "grad_norm": 0.9635463356971741, "learning_rate": 3.091216216216216e-05, "loss": 0.3088, "num_input_tokens_seen": 453048, "step": 2380 }, { "epoch": 1.2396049896049897, "grad_norm": 0.22256158292293549, "learning_rate": 3.097713097713098e-05, "loss": 0.1291, "num_input_tokens_seen": 454008, "step": 2385 }, { "epoch": 1.2422037422037422, "grad_norm": 1.8968786001205444, "learning_rate": 3.1042099792099794e-05, "loss": 0.2678, "num_input_tokens_seen": 454968, "step": 2390 }, { "epoch": 1.2448024948024947, "grad_norm": 0.7072094678878784, "learning_rate": 3.1107068607068606e-05, "loss": 0.1959, "num_input_tokens_seen": 455960, "step": 2395 }, { "epoch": 1.2474012474012475, "grad_norm": 0.4461927115917206, "learning_rate": 3.1172037422037426e-05, "loss": 0.1018, "num_input_tokens_seen": 456856, "step": 2400 }, { "epoch": 1.25, "grad_norm": 0.34878742694854736, "learning_rate": 3.123700623700624e-05, "loss": 0.1072, "num_input_tokens_seen": 457880, "step": 2405 }, { "epoch": 1.2525987525987525, "grad_norm": 0.5555586814880371, "learning_rate": 3.130197505197505e-05, "loss": 0.2015, "num_input_tokens_seen": 458840, "step": 2410 }, { "epoch": 1.255197505197505, "grad_norm": 0.32150229811668396, "learning_rate": 3.136694386694387e-05, "loss": 0.2329, "num_input_tokens_seen": 459800, "step": 2415 }, { "epoch": 1.2577962577962578, "grad_norm": 0.39119163155555725, "learning_rate": 3.143191268191268e-05, "loss": 0.3794, "num_input_tokens_seen": 460792, "step": 2420 }, { "epoch": 1.2603950103950103, "grad_norm": 0.7400370836257935, "learning_rate": 3.1496881496881495e-05, "loss": 0.1889, "num_input_tokens_seen": 461752, "step": 2425 }, { "epoch": 1.262993762993763, "grad_norm": 0.35445278882980347, "learning_rate": 3.156185031185031e-05, "loss": 0.1686, "num_input_tokens_seen": 462712, "step": 2430 }, { "epoch": 1.2655925155925156, "grad_norm": 1.208441972732544, "learning_rate": 3.162681912681913e-05, "loss": 0.1763, "num_input_tokens_seen": 463608, "step": 2435 }, { "epoch": 1.2681912681912682, "grad_norm": 0.9041829705238342, "learning_rate": 3.169178794178794e-05, "loss": 0.223, "num_input_tokens_seen": 464568, "step": 2440 }, { "epoch": 1.2707900207900207, "grad_norm": 0.20370253920555115, "learning_rate": 3.175675675675676e-05, "loss": 0.1598, "num_input_tokens_seen": 465496, "step": 2445 }, { "epoch": 1.2733887733887734, "grad_norm": 0.2634558081626892, "learning_rate": 3.182172557172558e-05, "loss": 0.2091, "num_input_tokens_seen": 466360, "step": 2450 }, { "epoch": 1.275987525987526, "grad_norm": 0.2332993447780609, "learning_rate": 3.188669438669439e-05, "loss": 0.1593, "num_input_tokens_seen": 467320, "step": 2455 }, { "epoch": 1.2785862785862787, "grad_norm": 0.5173628926277161, "learning_rate": 3.19516632016632e-05, "loss": 0.1317, "num_input_tokens_seen": 468184, "step": 2460 }, { "epoch": 1.2811850311850312, "grad_norm": 0.254893034696579, "learning_rate": 3.201663201663202e-05, "loss": 0.1011, "num_input_tokens_seen": 469176, "step": 2465 }, { "epoch": 1.2837837837837838, "grad_norm": 1.4806662797927856, "learning_rate": 3.2081600831600835e-05, "loss": 0.1739, "num_input_tokens_seen": 470136, "step": 2470 }, { "epoch": 1.2863825363825363, "grad_norm": 1.0846095085144043, "learning_rate": 3.214656964656965e-05, "loss": 0.1554, "num_input_tokens_seen": 471096, "step": 2475 }, { "epoch": 1.288981288981289, "grad_norm": 0.5368154048919678, "learning_rate": 3.221153846153847e-05, "loss": 0.1733, "num_input_tokens_seen": 472120, "step": 2480 }, { "epoch": 1.2915800415800416, "grad_norm": 0.46473628282546997, "learning_rate": 3.227650727650728e-05, "loss": 0.1919, "num_input_tokens_seen": 473048, "step": 2485 }, { "epoch": 1.2941787941787941, "grad_norm": 0.43947234749794006, "learning_rate": 3.234147609147609e-05, "loss": 0.1762, "num_input_tokens_seen": 473976, "step": 2490 }, { "epoch": 1.2967775467775469, "grad_norm": 0.7189226150512695, "learning_rate": 3.240644490644491e-05, "loss": 0.1075, "num_input_tokens_seen": 474936, "step": 2495 }, { "epoch": 1.2993762993762994, "grad_norm": 0.5927683711051941, "learning_rate": 3.2471413721413724e-05, "loss": 0.2827, "num_input_tokens_seen": 475896, "step": 2500 }, { "epoch": 1.301975051975052, "grad_norm": 0.5206925868988037, "learning_rate": 3.2536382536382536e-05, "loss": 0.1829, "num_input_tokens_seen": 476920, "step": 2505 }, { "epoch": 1.3045738045738045, "grad_norm": 1.178350806236267, "learning_rate": 3.260135135135135e-05, "loss": 0.1367, "num_input_tokens_seen": 477848, "step": 2510 }, { "epoch": 1.3071725571725572, "grad_norm": 0.22139212489128113, "learning_rate": 3.266632016632017e-05, "loss": 0.1097, "num_input_tokens_seen": 478808, "step": 2515 }, { "epoch": 1.3097713097713097, "grad_norm": 0.5122672915458679, "learning_rate": 3.273128898128898e-05, "loss": 0.2247, "num_input_tokens_seen": 479832, "step": 2520 }, { "epoch": 1.3123700623700625, "grad_norm": 0.3930198550224304, "learning_rate": 3.279625779625779e-05, "loss": 0.137, "num_input_tokens_seen": 480792, "step": 2525 }, { "epoch": 1.314968814968815, "grad_norm": 0.7105388641357422, "learning_rate": 3.286122661122661e-05, "loss": 0.139, "num_input_tokens_seen": 481752, "step": 2530 }, { "epoch": 1.3175675675675675, "grad_norm": 0.7606893181800842, "learning_rate": 3.2926195426195425e-05, "loss": 0.1114, "num_input_tokens_seen": 482680, "step": 2535 }, { "epoch": 1.32016632016632, "grad_norm": 0.8946520090103149, "learning_rate": 3.299116424116424e-05, "loss": 0.2997, "num_input_tokens_seen": 483704, "step": 2540 }, { "epoch": 1.3227650727650728, "grad_norm": 0.4684136211872101, "learning_rate": 3.305613305613306e-05, "loss": 0.2179, "num_input_tokens_seen": 484568, "step": 2545 }, { "epoch": 1.3253638253638254, "grad_norm": 0.17600588500499725, "learning_rate": 3.3121101871101876e-05, "loss": 0.1805, "num_input_tokens_seen": 485464, "step": 2550 }, { "epoch": 1.3279625779625779, "grad_norm": 0.9113808870315552, "learning_rate": 3.318607068607069e-05, "loss": 0.0839, "num_input_tokens_seen": 486328, "step": 2555 }, { "epoch": 1.3305613305613306, "grad_norm": 0.9235140085220337, "learning_rate": 3.325103950103951e-05, "loss": 0.147, "num_input_tokens_seen": 487288, "step": 2560 }, { "epoch": 1.3331600831600832, "grad_norm": 0.5700381398200989, "learning_rate": 3.331600831600832e-05, "loss": 0.2524, "num_input_tokens_seen": 488248, "step": 2565 }, { "epoch": 1.3357588357588357, "grad_norm": 0.5612170696258545, "learning_rate": 3.338097713097713e-05, "loss": 0.0662, "num_input_tokens_seen": 489272, "step": 2570 }, { "epoch": 1.3383575883575882, "grad_norm": 0.1886385679244995, "learning_rate": 3.344594594594595e-05, "loss": 0.2313, "num_input_tokens_seen": 490200, "step": 2575 }, { "epoch": 1.340956340956341, "grad_norm": 0.7573165893554688, "learning_rate": 3.3510914760914765e-05, "loss": 0.0763, "num_input_tokens_seen": 491192, "step": 2580 }, { "epoch": 1.3435550935550935, "grad_norm": 1.038520336151123, "learning_rate": 3.357588357588358e-05, "loss": 0.1764, "num_input_tokens_seen": 492056, "step": 2585 }, { "epoch": 1.3461538461538463, "grad_norm": 1.0367251634597778, "learning_rate": 3.364085239085239e-05, "loss": 0.1944, "num_input_tokens_seen": 493016, "step": 2590 }, { "epoch": 1.3487525987525988, "grad_norm": 0.9327481389045715, "learning_rate": 3.370582120582121e-05, "loss": 0.2794, "num_input_tokens_seen": 493976, "step": 2595 }, { "epoch": 1.3513513513513513, "grad_norm": 0.5052903890609741, "learning_rate": 3.377079002079002e-05, "loss": 0.1547, "num_input_tokens_seen": 494968, "step": 2600 }, { "epoch": 1.3539501039501038, "grad_norm": 0.853803813457489, "learning_rate": 3.3835758835758834e-05, "loss": 0.1816, "num_input_tokens_seen": 495992, "step": 2605 }, { "epoch": 1.3565488565488566, "grad_norm": 0.22105838358402252, "learning_rate": 3.3900727650727654e-05, "loss": 0.1248, "num_input_tokens_seen": 496888, "step": 2610 }, { "epoch": 1.3591476091476091, "grad_norm": 0.5844555497169495, "learning_rate": 3.3965696465696466e-05, "loss": 0.1654, "num_input_tokens_seen": 497880, "step": 2615 }, { "epoch": 1.3617463617463619, "grad_norm": 0.49510860443115234, "learning_rate": 3.403066528066528e-05, "loss": 0.2228, "num_input_tokens_seen": 498840, "step": 2620 }, { "epoch": 1.3643451143451144, "grad_norm": 0.1256479024887085, "learning_rate": 3.40956340956341e-05, "loss": 0.2011, "num_input_tokens_seen": 499736, "step": 2625 }, { "epoch": 1.366943866943867, "grad_norm": 0.45603829622268677, "learning_rate": 3.416060291060291e-05, "loss": 0.1937, "num_input_tokens_seen": 500728, "step": 2630 }, { "epoch": 1.3695426195426195, "grad_norm": 0.4112269878387451, "learning_rate": 3.422557172557172e-05, "loss": 0.1916, "num_input_tokens_seen": 501720, "step": 2635 }, { "epoch": 1.3721413721413722, "grad_norm": 0.48502543568611145, "learning_rate": 3.429054054054054e-05, "loss": 0.1698, "num_input_tokens_seen": 502648, "step": 2640 }, { "epoch": 1.3747401247401247, "grad_norm": 0.715972363948822, "learning_rate": 3.4355509355509355e-05, "loss": 0.2102, "num_input_tokens_seen": 503544, "step": 2645 }, { "epoch": 1.3773388773388773, "grad_norm": 1.1742384433746338, "learning_rate": 3.4420478170478174e-05, "loss": 0.2696, "num_input_tokens_seen": 504504, "step": 2650 }, { "epoch": 1.37993762993763, "grad_norm": 1.417620301246643, "learning_rate": 3.448544698544699e-05, "loss": 0.141, "num_input_tokens_seen": 505464, "step": 2655 }, { "epoch": 1.3825363825363826, "grad_norm": 0.27950331568717957, "learning_rate": 3.4550415800415806e-05, "loss": 0.1345, "num_input_tokens_seen": 506424, "step": 2660 }, { "epoch": 1.385135135135135, "grad_norm": 0.3545871675014496, "learning_rate": 3.461538461538462e-05, "loss": 0.19, "num_input_tokens_seen": 507448, "step": 2665 }, { "epoch": 1.3877338877338876, "grad_norm": 2.7442545890808105, "learning_rate": 3.468035343035343e-05, "loss": 0.3424, "num_input_tokens_seen": 508536, "step": 2670 }, { "epoch": 1.3903326403326404, "grad_norm": 2.339291572570801, "learning_rate": 3.474532224532225e-05, "loss": 0.3924, "num_input_tokens_seen": 509528, "step": 2675 }, { "epoch": 1.392931392931393, "grad_norm": 0.923659086227417, "learning_rate": 3.481029106029106e-05, "loss": 0.1823, "num_input_tokens_seen": 510488, "step": 2680 }, { "epoch": 1.3955301455301456, "grad_norm": 0.6033694744110107, "learning_rate": 3.4875259875259876e-05, "loss": 0.1698, "num_input_tokens_seen": 511544, "step": 2685 }, { "epoch": 1.3981288981288982, "grad_norm": 0.7577350735664368, "learning_rate": 3.4940228690228695e-05, "loss": 0.2331, "num_input_tokens_seen": 512568, "step": 2690 }, { "epoch": 1.4007276507276507, "grad_norm": 0.6107421517372131, "learning_rate": 3.500519750519751e-05, "loss": 0.2492, "num_input_tokens_seen": 513464, "step": 2695 }, { "epoch": 1.4033264033264032, "grad_norm": 1.0242588520050049, "learning_rate": 3.507016632016632e-05, "loss": 0.2291, "num_input_tokens_seen": 514424, "step": 2700 }, { "epoch": 1.405925155925156, "grad_norm": 0.17791293561458588, "learning_rate": 3.513513513513514e-05, "loss": 0.1477, "num_input_tokens_seen": 515384, "step": 2705 }, { "epoch": 1.4085239085239085, "grad_norm": 0.17211943864822388, "learning_rate": 3.520010395010395e-05, "loss": 0.1068, "num_input_tokens_seen": 516280, "step": 2710 }, { "epoch": 1.411122661122661, "grad_norm": 0.46098607778549194, "learning_rate": 3.5265072765072764e-05, "loss": 0.2063, "num_input_tokens_seen": 517240, "step": 2715 }, { "epoch": 1.4137214137214138, "grad_norm": 0.4999559819698334, "learning_rate": 3.5330041580041584e-05, "loss": 0.2925, "num_input_tokens_seen": 518200, "step": 2720 }, { "epoch": 1.4163201663201663, "grad_norm": 0.7819861769676208, "learning_rate": 3.5395010395010396e-05, "loss": 0.1354, "num_input_tokens_seen": 519256, "step": 2725 }, { "epoch": 1.4189189189189189, "grad_norm": 0.7062827348709106, "learning_rate": 3.545997920997921e-05, "loss": 0.1843, "num_input_tokens_seen": 520280, "step": 2730 }, { "epoch": 1.4215176715176714, "grad_norm": 0.9656638503074646, "learning_rate": 3.552494802494803e-05, "loss": 0.1592, "num_input_tokens_seen": 521240, "step": 2735 }, { "epoch": 1.4241164241164241, "grad_norm": 0.8266677856445312, "learning_rate": 3.558991683991684e-05, "loss": 0.1006, "num_input_tokens_seen": 522200, "step": 2740 }, { "epoch": 1.4267151767151767, "grad_norm": 0.9675162434577942, "learning_rate": 3.565488565488565e-05, "loss": 0.219, "num_input_tokens_seen": 523128, "step": 2745 }, { "epoch": 1.4293139293139294, "grad_norm": 0.816233217716217, "learning_rate": 3.571985446985447e-05, "loss": 0.212, "num_input_tokens_seen": 524184, "step": 2750 }, { "epoch": 1.431912681912682, "grad_norm": 0.3019169569015503, "learning_rate": 3.5784823284823285e-05, "loss": 0.2317, "num_input_tokens_seen": 525080, "step": 2755 }, { "epoch": 1.4345114345114345, "grad_norm": 0.536730945110321, "learning_rate": 3.5849792099792104e-05, "loss": 0.3174, "num_input_tokens_seen": 526008, "step": 2760 }, { "epoch": 1.437110187110187, "grad_norm": 0.2228458821773529, "learning_rate": 3.591476091476092e-05, "loss": 0.123, "num_input_tokens_seen": 526936, "step": 2765 }, { "epoch": 1.4397089397089398, "grad_norm": 0.3189751207828522, "learning_rate": 3.5979729729729736e-05, "loss": 0.1622, "num_input_tokens_seen": 527896, "step": 2770 }, { "epoch": 1.4423076923076923, "grad_norm": 0.33862313628196716, "learning_rate": 3.604469854469855e-05, "loss": 0.1275, "num_input_tokens_seen": 528920, "step": 2775 }, { "epoch": 1.444906444906445, "grad_norm": 0.39699962735176086, "learning_rate": 3.610966735966736e-05, "loss": 0.2572, "num_input_tokens_seen": 529880, "step": 2780 }, { "epoch": 1.4475051975051976, "grad_norm": 0.16917100548744202, "learning_rate": 3.617463617463618e-05, "loss": 0.1508, "num_input_tokens_seen": 530840, "step": 2785 }, { "epoch": 1.45010395010395, "grad_norm": 0.4169575572013855, "learning_rate": 3.623960498960499e-05, "loss": 0.167, "num_input_tokens_seen": 531704, "step": 2790 }, { "epoch": 1.4527027027027026, "grad_norm": 0.5056340098381042, "learning_rate": 3.6304573804573805e-05, "loss": 0.1578, "num_input_tokens_seen": 532728, "step": 2795 }, { "epoch": 1.4553014553014554, "grad_norm": 0.9285807609558105, "learning_rate": 3.6369542619542625e-05, "loss": 0.2335, "num_input_tokens_seen": 533688, "step": 2800 }, { "epoch": 1.457900207900208, "grad_norm": 0.2718590199947357, "learning_rate": 3.643451143451144e-05, "loss": 0.1343, "num_input_tokens_seen": 534648, "step": 2805 }, { "epoch": 1.4604989604989604, "grad_norm": 0.8362157344818115, "learning_rate": 3.649948024948025e-05, "loss": 0.1617, "num_input_tokens_seen": 535672, "step": 2810 }, { "epoch": 1.4630977130977132, "grad_norm": 0.38106489181518555, "learning_rate": 3.656444906444907e-05, "loss": 0.2311, "num_input_tokens_seen": 536664, "step": 2815 }, { "epoch": 1.4656964656964657, "grad_norm": 1.132148265838623, "learning_rate": 3.662941787941788e-05, "loss": 0.125, "num_input_tokens_seen": 537656, "step": 2820 }, { "epoch": 1.4682952182952183, "grad_norm": 0.36933737993240356, "learning_rate": 3.6694386694386694e-05, "loss": 0.212, "num_input_tokens_seen": 538616, "step": 2825 }, { "epoch": 1.4708939708939708, "grad_norm": 0.8510437607765198, "learning_rate": 3.675935550935551e-05, "loss": 0.2259, "num_input_tokens_seen": 539544, "step": 2830 }, { "epoch": 1.4734927234927235, "grad_norm": 0.9769805669784546, "learning_rate": 3.6824324324324326e-05, "loss": 0.2519, "num_input_tokens_seen": 540504, "step": 2835 }, { "epoch": 1.476091476091476, "grad_norm": 0.5259256958961487, "learning_rate": 3.688929313929314e-05, "loss": 0.1991, "num_input_tokens_seen": 541528, "step": 2840 }, { "epoch": 1.4786902286902288, "grad_norm": 1.118443489074707, "learning_rate": 3.695426195426195e-05, "loss": 0.1942, "num_input_tokens_seen": 542520, "step": 2845 }, { "epoch": 1.4812889812889813, "grad_norm": 0.3749624192714691, "learning_rate": 3.701923076923077e-05, "loss": 0.0934, "num_input_tokens_seen": 543512, "step": 2850 }, { "epoch": 1.4838877338877339, "grad_norm": 0.45317980647087097, "learning_rate": 3.708419958419958e-05, "loss": 0.1819, "num_input_tokens_seen": 544472, "step": 2855 }, { "epoch": 1.4864864864864864, "grad_norm": 0.32480570673942566, "learning_rate": 3.71491683991684e-05, "loss": 0.1522, "num_input_tokens_seen": 545400, "step": 2860 }, { "epoch": 1.4890852390852392, "grad_norm": 0.412252813577652, "learning_rate": 3.7214137214137215e-05, "loss": 0.1477, "num_input_tokens_seen": 546360, "step": 2865 }, { "epoch": 1.4916839916839917, "grad_norm": 0.5901240706443787, "learning_rate": 3.7279106029106034e-05, "loss": 0.1709, "num_input_tokens_seen": 547352, "step": 2870 }, { "epoch": 1.4942827442827442, "grad_norm": 0.9259294271469116, "learning_rate": 3.734407484407485e-05, "loss": 0.1486, "num_input_tokens_seen": 548280, "step": 2875 }, { "epoch": 1.496881496881497, "grad_norm": 0.7888729572296143, "learning_rate": 3.7409043659043666e-05, "loss": 0.1254, "num_input_tokens_seen": 549240, "step": 2880 }, { "epoch": 1.4994802494802495, "grad_norm": 0.45642027258872986, "learning_rate": 3.747401247401248e-05, "loss": 0.1628, "num_input_tokens_seen": 550168, "step": 2885 }, { "epoch": 1.502079002079002, "grad_norm": 0.11955709755420685, "learning_rate": 3.753898128898129e-05, "loss": 0.0909, "num_input_tokens_seen": 551160, "step": 2890 }, { "epoch": 1.5046777546777546, "grad_norm": 0.4711052179336548, "learning_rate": 3.7603950103950104e-05, "loss": 0.1268, "num_input_tokens_seen": 552152, "step": 2895 }, { "epoch": 1.5072765072765073, "grad_norm": 0.3409505784511566, "learning_rate": 3.766891891891892e-05, "loss": 0.175, "num_input_tokens_seen": 553176, "step": 2900 }, { "epoch": 1.5098752598752598, "grad_norm": 0.9875774383544922, "learning_rate": 3.7733887733887735e-05, "loss": 0.2658, "num_input_tokens_seen": 554232, "step": 2905 }, { "epoch": 1.5124740124740126, "grad_norm": 1.4352970123291016, "learning_rate": 3.779885654885655e-05, "loss": 0.1931, "num_input_tokens_seen": 555192, "step": 2910 }, { "epoch": 1.5150727650727651, "grad_norm": 0.28798675537109375, "learning_rate": 3.786382536382537e-05, "loss": 0.1477, "num_input_tokens_seen": 556120, "step": 2915 }, { "epoch": 1.5176715176715176, "grad_norm": 0.6897301077842712, "learning_rate": 3.792879417879418e-05, "loss": 0.1374, "num_input_tokens_seen": 557112, "step": 2920 }, { "epoch": 1.5202702702702702, "grad_norm": 1.1417486667633057, "learning_rate": 3.799376299376299e-05, "loss": 0.1598, "num_input_tokens_seen": 558040, "step": 2925 }, { "epoch": 1.5228690228690227, "grad_norm": 1.2394835948944092, "learning_rate": 3.805873180873181e-05, "loss": 0.2904, "num_input_tokens_seen": 558968, "step": 2930 }, { "epoch": 1.5254677754677755, "grad_norm": 0.6588528752326965, "learning_rate": 3.8123700623700624e-05, "loss": 0.1285, "num_input_tokens_seen": 559928, "step": 2935 }, { "epoch": 1.5280665280665282, "grad_norm": 1.319061517715454, "learning_rate": 3.818866943866944e-05, "loss": 0.1998, "num_input_tokens_seen": 560888, "step": 2940 }, { "epoch": 1.5306652806652807, "grad_norm": 1.2174345254898071, "learning_rate": 3.8253638253638256e-05, "loss": 0.2189, "num_input_tokens_seen": 561880, "step": 2945 }, { "epoch": 1.5332640332640333, "grad_norm": 0.2360023856163025, "learning_rate": 3.831860706860707e-05, "loss": 0.1301, "num_input_tokens_seen": 562776, "step": 2950 }, { "epoch": 1.5358627858627858, "grad_norm": 0.38428032398223877, "learning_rate": 3.838357588357588e-05, "loss": 0.1321, "num_input_tokens_seen": 563672, "step": 2955 }, { "epoch": 1.5384615384615383, "grad_norm": 1.2793809175491333, "learning_rate": 3.84485446985447e-05, "loss": 0.2231, "num_input_tokens_seen": 564568, "step": 2960 }, { "epoch": 1.541060291060291, "grad_norm": 0.4097152054309845, "learning_rate": 3.851351351351351e-05, "loss": 0.1764, "num_input_tokens_seen": 565528, "step": 2965 }, { "epoch": 1.5436590436590436, "grad_norm": 1.13615083694458, "learning_rate": 3.857848232848233e-05, "loss": 0.1263, "num_input_tokens_seen": 566552, "step": 2970 }, { "epoch": 1.5462577962577964, "grad_norm": 1.09328031539917, "learning_rate": 3.8643451143451145e-05, "loss": 0.1608, "num_input_tokens_seen": 567544, "step": 2975 }, { "epoch": 1.5488565488565489, "grad_norm": 0.1554454267024994, "learning_rate": 3.8708419958419964e-05, "loss": 0.196, "num_input_tokens_seen": 568536, "step": 2980 }, { "epoch": 1.5514553014553014, "grad_norm": 0.8923830389976501, "learning_rate": 3.8773388773388777e-05, "loss": 0.2354, "num_input_tokens_seen": 569656, "step": 2985 }, { "epoch": 1.554054054054054, "grad_norm": 0.25250834226608276, "learning_rate": 3.883835758835759e-05, "loss": 0.1658, "num_input_tokens_seen": 570616, "step": 2990 }, { "epoch": 1.5566528066528067, "grad_norm": 0.21509024500846863, "learning_rate": 3.890332640332641e-05, "loss": 0.2102, "num_input_tokens_seen": 571544, "step": 2995 }, { "epoch": 1.5592515592515592, "grad_norm": 0.35704970359802246, "learning_rate": 3.896829521829522e-05, "loss": 0.1005, "num_input_tokens_seen": 572536, "step": 3000 }, { "epoch": 1.561850311850312, "grad_norm": 0.24720415472984314, "learning_rate": 3.9033264033264033e-05, "loss": 0.1627, "num_input_tokens_seen": 573464, "step": 3005 }, { "epoch": 1.5644490644490645, "grad_norm": 0.34256893396377563, "learning_rate": 3.909823284823285e-05, "loss": 0.179, "num_input_tokens_seen": 574520, "step": 3010 }, { "epoch": 1.567047817047817, "grad_norm": 0.3322994112968445, "learning_rate": 3.9163201663201665e-05, "loss": 0.172, "num_input_tokens_seen": 575416, "step": 3015 }, { "epoch": 1.5696465696465696, "grad_norm": 0.2554699182510376, "learning_rate": 3.922817047817048e-05, "loss": 0.1596, "num_input_tokens_seen": 576440, "step": 3020 }, { "epoch": 1.572245322245322, "grad_norm": 0.6605627536773682, "learning_rate": 3.92931392931393e-05, "loss": 0.248, "num_input_tokens_seen": 577464, "step": 3025 }, { "epoch": 1.5748440748440748, "grad_norm": 0.7149372100830078, "learning_rate": 3.935810810810811e-05, "loss": 0.2079, "num_input_tokens_seen": 578360, "step": 3030 }, { "epoch": 1.5774428274428276, "grad_norm": 0.3811071813106537, "learning_rate": 3.942307692307692e-05, "loss": 0.1539, "num_input_tokens_seen": 579384, "step": 3035 }, { "epoch": 1.5800415800415801, "grad_norm": 0.7019319534301758, "learning_rate": 3.948804573804574e-05, "loss": 0.1554, "num_input_tokens_seen": 580344, "step": 3040 }, { "epoch": 1.5826403326403327, "grad_norm": 0.9319161176681519, "learning_rate": 3.9553014553014554e-05, "loss": 0.0799, "num_input_tokens_seen": 581336, "step": 3045 }, { "epoch": 1.5852390852390852, "grad_norm": 0.6652297377586365, "learning_rate": 3.9617983367983367e-05, "loss": 0.221, "num_input_tokens_seen": 582296, "step": 3050 }, { "epoch": 1.5878378378378377, "grad_norm": 0.4724944531917572, "learning_rate": 3.968295218295218e-05, "loss": 0.1381, "num_input_tokens_seen": 583288, "step": 3055 }, { "epoch": 1.5904365904365905, "grad_norm": 0.6586969494819641, "learning_rate": 3.9747920997921e-05, "loss": 0.1672, "num_input_tokens_seen": 584248, "step": 3060 }, { "epoch": 1.593035343035343, "grad_norm": 1.1014283895492554, "learning_rate": 3.981288981288981e-05, "loss": 0.2147, "num_input_tokens_seen": 585208, "step": 3065 }, { "epoch": 1.5956340956340958, "grad_norm": 0.671459436416626, "learning_rate": 3.987785862785863e-05, "loss": 0.1992, "num_input_tokens_seen": 586136, "step": 3070 }, { "epoch": 1.5982328482328483, "grad_norm": 0.35608983039855957, "learning_rate": 3.994282744282745e-05, "loss": 0.1282, "num_input_tokens_seen": 587128, "step": 3075 }, { "epoch": 1.6008316008316008, "grad_norm": 0.6400788426399231, "learning_rate": 4.000779625779626e-05, "loss": 0.1212, "num_input_tokens_seen": 587960, "step": 3080 }, { "epoch": 1.6034303534303533, "grad_norm": 0.22614973783493042, "learning_rate": 4.0072765072765075e-05, "loss": 0.195, "num_input_tokens_seen": 588920, "step": 3085 }, { "epoch": 1.6060291060291059, "grad_norm": 0.5414079427719116, "learning_rate": 4.0137733887733894e-05, "loss": 0.1553, "num_input_tokens_seen": 589912, "step": 3090 }, { "epoch": 1.6086278586278586, "grad_norm": 0.4785781800746918, "learning_rate": 4.0202702702702707e-05, "loss": 0.148, "num_input_tokens_seen": 590872, "step": 3095 }, { "epoch": 1.6112266112266114, "grad_norm": 0.37389224767684937, "learning_rate": 4.026767151767152e-05, "loss": 0.2889, "num_input_tokens_seen": 591768, "step": 3100 }, { "epoch": 1.613825363825364, "grad_norm": 0.36006826162338257, "learning_rate": 4.033264033264034e-05, "loss": 0.1104, "num_input_tokens_seen": 592728, "step": 3105 }, { "epoch": 1.6164241164241164, "grad_norm": 0.6998964548110962, "learning_rate": 4.039760914760915e-05, "loss": 0.127, "num_input_tokens_seen": 593592, "step": 3110 }, { "epoch": 1.619022869022869, "grad_norm": 1.6282966136932373, "learning_rate": 4.0462577962577963e-05, "loss": 0.2389, "num_input_tokens_seen": 594520, "step": 3115 }, { "epoch": 1.6216216216216215, "grad_norm": 0.42650312185287476, "learning_rate": 4.052754677754678e-05, "loss": 0.1851, "num_input_tokens_seen": 595480, "step": 3120 }, { "epoch": 1.6242203742203742, "grad_norm": 0.2474794238805771, "learning_rate": 4.0592515592515595e-05, "loss": 0.2625, "num_input_tokens_seen": 596408, "step": 3125 }, { "epoch": 1.6268191268191268, "grad_norm": 0.3635398745536804, "learning_rate": 4.065748440748441e-05, "loss": 0.1093, "num_input_tokens_seen": 597368, "step": 3130 }, { "epoch": 1.6294178794178795, "grad_norm": 0.3944593667984009, "learning_rate": 4.072245322245322e-05, "loss": 0.0641, "num_input_tokens_seen": 598264, "step": 3135 }, { "epoch": 1.632016632016632, "grad_norm": 0.4416825771331787, "learning_rate": 4.078742203742204e-05, "loss": 0.1505, "num_input_tokens_seen": 599256, "step": 3140 }, { "epoch": 1.6346153846153846, "grad_norm": 1.9619745016098022, "learning_rate": 4.085239085239085e-05, "loss": 0.215, "num_input_tokens_seen": 600184, "step": 3145 }, { "epoch": 1.637214137214137, "grad_norm": 1.5924628973007202, "learning_rate": 4.0917359667359665e-05, "loss": 0.2171, "num_input_tokens_seen": 601112, "step": 3150 }, { "epoch": 1.6398128898128899, "grad_norm": 0.48160141706466675, "learning_rate": 4.0982328482328484e-05, "loss": 0.1492, "num_input_tokens_seen": 602040, "step": 3155 }, { "epoch": 1.6424116424116424, "grad_norm": 0.31696048378944397, "learning_rate": 4.1047297297297297e-05, "loss": 0.0618, "num_input_tokens_seen": 603032, "step": 3160 }, { "epoch": 1.6450103950103951, "grad_norm": 1.132576584815979, "learning_rate": 4.111226611226611e-05, "loss": 0.1456, "num_input_tokens_seen": 603960, "step": 3165 }, { "epoch": 1.6476091476091477, "grad_norm": 0.46007487177848816, "learning_rate": 4.117723492723493e-05, "loss": 0.186, "num_input_tokens_seen": 604952, "step": 3170 }, { "epoch": 1.6502079002079002, "grad_norm": 0.5150517225265503, "learning_rate": 4.124220374220375e-05, "loss": 0.0947, "num_input_tokens_seen": 605944, "step": 3175 }, { "epoch": 1.6528066528066527, "grad_norm": 0.4701213836669922, "learning_rate": 4.130717255717256e-05, "loss": 0.215, "num_input_tokens_seen": 606936, "step": 3180 }, { "epoch": 1.6554054054054053, "grad_norm": 0.9082627296447754, "learning_rate": 4.137214137214138e-05, "loss": 0.2051, "num_input_tokens_seen": 607928, "step": 3185 }, { "epoch": 1.658004158004158, "grad_norm": 0.9372928738594055, "learning_rate": 4.143711018711019e-05, "loss": 0.1179, "num_input_tokens_seen": 608888, "step": 3190 }, { "epoch": 1.6606029106029108, "grad_norm": 0.5978896021842957, "learning_rate": 4.1502079002079005e-05, "loss": 0.1799, "num_input_tokens_seen": 609784, "step": 3195 }, { "epoch": 1.6632016632016633, "grad_norm": 1.0681320428848267, "learning_rate": 4.1567047817047824e-05, "loss": 0.2362, "num_input_tokens_seen": 610712, "step": 3200 }, { "epoch": 1.6658004158004158, "grad_norm": 0.3227379024028778, "learning_rate": 4.1632016632016636e-05, "loss": 0.1359, "num_input_tokens_seen": 611704, "step": 3205 }, { "epoch": 1.6683991683991684, "grad_norm": 0.6597152352333069, "learning_rate": 4.169698544698545e-05, "loss": 0.2137, "num_input_tokens_seen": 612728, "step": 3210 }, { "epoch": 1.6709979209979209, "grad_norm": 0.3024514317512512, "learning_rate": 4.176195426195426e-05, "loss": 0.2103, "num_input_tokens_seen": 613592, "step": 3215 }, { "epoch": 1.6735966735966736, "grad_norm": 0.18272322416305542, "learning_rate": 4.182692307692308e-05, "loss": 0.1186, "num_input_tokens_seen": 614648, "step": 3220 }, { "epoch": 1.6761954261954262, "grad_norm": 0.9568337798118591, "learning_rate": 4.189189189189189e-05, "loss": 0.1503, "num_input_tokens_seen": 615576, "step": 3225 }, { "epoch": 1.678794178794179, "grad_norm": 0.6478314399719238, "learning_rate": 4.1956860706860706e-05, "loss": 0.1595, "num_input_tokens_seen": 616504, "step": 3230 }, { "epoch": 1.6813929313929314, "grad_norm": 0.2908910810947418, "learning_rate": 4.2021829521829525e-05, "loss": 0.2306, "num_input_tokens_seen": 617464, "step": 3235 }, { "epoch": 1.683991683991684, "grad_norm": 0.35413745045661926, "learning_rate": 4.208679833679834e-05, "loss": 0.175, "num_input_tokens_seen": 618424, "step": 3240 }, { "epoch": 1.6865904365904365, "grad_norm": 0.5437381267547607, "learning_rate": 4.215176715176715e-05, "loss": 0.1317, "num_input_tokens_seen": 619352, "step": 3245 }, { "epoch": 1.689189189189189, "grad_norm": 1.7373422384262085, "learning_rate": 4.221673596673597e-05, "loss": 0.2707, "num_input_tokens_seen": 620280, "step": 3250 }, { "epoch": 1.6917879417879418, "grad_norm": 0.36108750104904175, "learning_rate": 4.228170478170478e-05, "loss": 0.1154, "num_input_tokens_seen": 621240, "step": 3255 }, { "epoch": 1.6943866943866945, "grad_norm": 0.5785974264144897, "learning_rate": 4.2346673596673595e-05, "loss": 0.1854, "num_input_tokens_seen": 622168, "step": 3260 }, { "epoch": 1.696985446985447, "grad_norm": 1.2443418502807617, "learning_rate": 4.2411642411642414e-05, "loss": 0.1499, "num_input_tokens_seen": 623128, "step": 3265 }, { "epoch": 1.6995841995841996, "grad_norm": 0.3546992540359497, "learning_rate": 4.2476611226611226e-05, "loss": 0.0578, "num_input_tokens_seen": 624056, "step": 3270 }, { "epoch": 1.7021829521829521, "grad_norm": 0.8531044125556946, "learning_rate": 4.2541580041580046e-05, "loss": 0.1364, "num_input_tokens_seen": 624984, "step": 3275 }, { "epoch": 1.7047817047817047, "grad_norm": 0.20471961796283722, "learning_rate": 4.260654885654886e-05, "loss": 0.1294, "num_input_tokens_seen": 625976, "step": 3280 }, { "epoch": 1.7073804573804574, "grad_norm": 1.0878278017044067, "learning_rate": 4.267151767151768e-05, "loss": 0.2104, "num_input_tokens_seen": 626936, "step": 3285 }, { "epoch": 1.70997920997921, "grad_norm": 0.4464767277240753, "learning_rate": 4.273648648648649e-05, "loss": 0.1579, "num_input_tokens_seen": 627800, "step": 3290 }, { "epoch": 1.7125779625779627, "grad_norm": 0.27632248401641846, "learning_rate": 4.28014553014553e-05, "loss": 0.2985, "num_input_tokens_seen": 628792, "step": 3295 }, { "epoch": 1.7151767151767152, "grad_norm": 0.6648584604263306, "learning_rate": 4.286642411642412e-05, "loss": 0.1996, "num_input_tokens_seen": 629752, "step": 3300 }, { "epoch": 1.7177754677754677, "grad_norm": 0.2988659143447876, "learning_rate": 4.2931392931392935e-05, "loss": 0.096, "num_input_tokens_seen": 630712, "step": 3305 }, { "epoch": 1.7203742203742203, "grad_norm": 0.23992353677749634, "learning_rate": 4.299636174636175e-05, "loss": 0.1266, "num_input_tokens_seen": 631640, "step": 3310 }, { "epoch": 1.722972972972973, "grad_norm": 0.797236442565918, "learning_rate": 4.3061330561330566e-05, "loss": 0.1223, "num_input_tokens_seen": 632568, "step": 3315 }, { "epoch": 1.7255717255717256, "grad_norm": 0.6243069767951965, "learning_rate": 4.312629937629938e-05, "loss": 0.1666, "num_input_tokens_seen": 633464, "step": 3320 }, { "epoch": 1.7281704781704783, "grad_norm": 0.4052852988243103, "learning_rate": 4.319126819126819e-05, "loss": 0.1334, "num_input_tokens_seen": 634424, "step": 3325 }, { "epoch": 1.7307692307692308, "grad_norm": 0.3909969925880432, "learning_rate": 4.325623700623701e-05, "loss": 0.3068, "num_input_tokens_seen": 635416, "step": 3330 }, { "epoch": 1.7333679833679834, "grad_norm": 0.8020495772361755, "learning_rate": 4.332120582120582e-05, "loss": 0.1645, "num_input_tokens_seen": 636312, "step": 3335 }, { "epoch": 1.735966735966736, "grad_norm": 0.25527024269104004, "learning_rate": 4.3386174636174636e-05, "loss": 0.1948, "num_input_tokens_seen": 637240, "step": 3340 }, { "epoch": 1.7385654885654884, "grad_norm": 1.0260816812515259, "learning_rate": 4.3451143451143455e-05, "loss": 0.2643, "num_input_tokens_seen": 638232, "step": 3345 }, { "epoch": 1.7411642411642412, "grad_norm": 0.6057201623916626, "learning_rate": 4.351611226611227e-05, "loss": 0.0766, "num_input_tokens_seen": 639192, "step": 3350 }, { "epoch": 1.743762993762994, "grad_norm": 0.9144442081451416, "learning_rate": 4.358108108108108e-05, "loss": 0.3286, "num_input_tokens_seen": 640184, "step": 3355 }, { "epoch": 1.7463617463617465, "grad_norm": 0.285195529460907, "learning_rate": 4.36460498960499e-05, "loss": 0.2344, "num_input_tokens_seen": 641144, "step": 3360 }, { "epoch": 1.748960498960499, "grad_norm": 0.9442676305770874, "learning_rate": 4.371101871101871e-05, "loss": 0.0971, "num_input_tokens_seen": 642104, "step": 3365 }, { "epoch": 1.7515592515592515, "grad_norm": 0.3037884533405304, "learning_rate": 4.3775987525987525e-05, "loss": 0.1777, "num_input_tokens_seen": 643000, "step": 3370 }, { "epoch": 1.754158004158004, "grad_norm": 0.3623829782009125, "learning_rate": 4.3840956340956344e-05, "loss": 0.1167, "num_input_tokens_seen": 643992, "step": 3375 }, { "epoch": 1.7567567567567568, "grad_norm": 0.30061814188957214, "learning_rate": 4.3905925155925156e-05, "loss": 0.0468, "num_input_tokens_seen": 644984, "step": 3380 }, { "epoch": 1.7593555093555093, "grad_norm": 0.5882213115692139, "learning_rate": 4.3970893970893976e-05, "loss": 0.1818, "num_input_tokens_seen": 645912, "step": 3385 }, { "epoch": 1.761954261954262, "grad_norm": 0.44318580627441406, "learning_rate": 4.403586278586279e-05, "loss": 0.0783, "num_input_tokens_seen": 646968, "step": 3390 }, { "epoch": 1.7645530145530146, "grad_norm": 0.8099141120910645, "learning_rate": 4.410083160083161e-05, "loss": 0.2548, "num_input_tokens_seen": 647864, "step": 3395 }, { "epoch": 1.7671517671517671, "grad_norm": 0.37865573167800903, "learning_rate": 4.416580041580042e-05, "loss": 0.1604, "num_input_tokens_seen": 648856, "step": 3400 }, { "epoch": 1.7697505197505197, "grad_norm": 0.21262158453464508, "learning_rate": 4.423076923076923e-05, "loss": 0.1594, "num_input_tokens_seen": 649816, "step": 3405 }, { "epoch": 1.7723492723492722, "grad_norm": 0.8091773390769958, "learning_rate": 4.429573804573805e-05, "loss": 0.0887, "num_input_tokens_seen": 650712, "step": 3410 }, { "epoch": 1.774948024948025, "grad_norm": 0.5449467897415161, "learning_rate": 4.4360706860706864e-05, "loss": 0.1032, "num_input_tokens_seen": 651704, "step": 3415 }, { "epoch": 1.7775467775467777, "grad_norm": 0.44262275099754333, "learning_rate": 4.442567567567568e-05, "loss": 0.2916, "num_input_tokens_seen": 652664, "step": 3420 }, { "epoch": 1.7801455301455302, "grad_norm": 0.0536322146654129, "learning_rate": 4.4490644490644496e-05, "loss": 0.0997, "num_input_tokens_seen": 653624, "step": 3425 }, { "epoch": 1.7827442827442828, "grad_norm": 1.6946450471878052, "learning_rate": 4.455561330561331e-05, "loss": 0.3327, "num_input_tokens_seen": 654648, "step": 3430 }, { "epoch": 1.7853430353430353, "grad_norm": 1.3891669511795044, "learning_rate": 4.462058212058212e-05, "loss": 0.1219, "num_input_tokens_seen": 655640, "step": 3435 }, { "epoch": 1.7879417879417878, "grad_norm": 0.6441784501075745, "learning_rate": 4.4685550935550934e-05, "loss": 0.1309, "num_input_tokens_seen": 656600, "step": 3440 }, { "epoch": 1.7905405405405406, "grad_norm": 1.1043446063995361, "learning_rate": 4.475051975051975e-05, "loss": 0.1247, "num_input_tokens_seen": 657592, "step": 3445 }, { "epoch": 1.793139293139293, "grad_norm": 1.1029192209243774, "learning_rate": 4.4815488565488566e-05, "loss": 0.2581, "num_input_tokens_seen": 658648, "step": 3450 }, { "epoch": 1.7957380457380459, "grad_norm": 0.6644232273101807, "learning_rate": 4.488045738045738e-05, "loss": 0.1547, "num_input_tokens_seen": 659608, "step": 3455 }, { "epoch": 1.7983367983367984, "grad_norm": 0.2097407877445221, "learning_rate": 4.49454261954262e-05, "loss": 0.1573, "num_input_tokens_seen": 660600, "step": 3460 }, { "epoch": 1.800935550935551, "grad_norm": 0.4402249753475189, "learning_rate": 4.501039501039501e-05, "loss": 0.1395, "num_input_tokens_seen": 661592, "step": 3465 }, { "epoch": 1.8035343035343034, "grad_norm": 0.7114821672439575, "learning_rate": 4.507536382536382e-05, "loss": 0.1032, "num_input_tokens_seen": 662520, "step": 3470 }, { "epoch": 1.806133056133056, "grad_norm": 0.17392316460609436, "learning_rate": 4.514033264033264e-05, "loss": 0.1295, "num_input_tokens_seen": 663576, "step": 3475 }, { "epoch": 1.8087318087318087, "grad_norm": 0.09677068144083023, "learning_rate": 4.5205301455301455e-05, "loss": 0.0923, "num_input_tokens_seen": 664472, "step": 3480 }, { "epoch": 1.8113305613305615, "grad_norm": 0.2667969763278961, "learning_rate": 4.5270270270270274e-05, "loss": 0.1909, "num_input_tokens_seen": 665368, "step": 3485 }, { "epoch": 1.813929313929314, "grad_norm": 0.6867556571960449, "learning_rate": 4.5335239085239086e-05, "loss": 0.1595, "num_input_tokens_seen": 666296, "step": 3490 }, { "epoch": 1.8165280665280665, "grad_norm": 1.2060717344284058, "learning_rate": 4.5400207900207906e-05, "loss": 0.3059, "num_input_tokens_seen": 667320, "step": 3495 }, { "epoch": 1.819126819126819, "grad_norm": 0.30197566747665405, "learning_rate": 4.546517671517672e-05, "loss": 0.2262, "num_input_tokens_seen": 668216, "step": 3500 }, { "epoch": 1.8217255717255716, "grad_norm": 0.6930983066558838, "learning_rate": 4.553014553014554e-05, "loss": 0.2471, "num_input_tokens_seen": 669176, "step": 3505 }, { "epoch": 1.8243243243243243, "grad_norm": 0.7442288994789124, "learning_rate": 4.559511434511435e-05, "loss": 0.1888, "num_input_tokens_seen": 670072, "step": 3510 }, { "epoch": 1.8269230769230769, "grad_norm": 0.8727442026138306, "learning_rate": 4.566008316008316e-05, "loss": 0.1414, "num_input_tokens_seen": 671000, "step": 3515 }, { "epoch": 1.8295218295218296, "grad_norm": 0.6677424907684326, "learning_rate": 4.5725051975051975e-05, "loss": 0.1306, "num_input_tokens_seen": 671960, "step": 3520 }, { "epoch": 1.8321205821205822, "grad_norm": 0.8469691872596741, "learning_rate": 4.5790020790020794e-05, "loss": 0.1904, "num_input_tokens_seen": 672920, "step": 3525 }, { "epoch": 1.8347193347193347, "grad_norm": 0.1419009268283844, "learning_rate": 4.585498960498961e-05, "loss": 0.1439, "num_input_tokens_seen": 673944, "step": 3530 }, { "epoch": 1.8373180873180872, "grad_norm": 0.4462950825691223, "learning_rate": 4.591995841995842e-05, "loss": 0.2083, "num_input_tokens_seen": 674872, "step": 3535 }, { "epoch": 1.83991683991684, "grad_norm": 1.962852120399475, "learning_rate": 4.598492723492724e-05, "loss": 0.2418, "num_input_tokens_seen": 675832, "step": 3540 }, { "epoch": 1.8425155925155925, "grad_norm": 0.6568702459335327, "learning_rate": 4.604989604989605e-05, "loss": 0.1342, "num_input_tokens_seen": 676824, "step": 3545 }, { "epoch": 1.8451143451143452, "grad_norm": 1.0208008289337158, "learning_rate": 4.6114864864864864e-05, "loss": 0.1361, "num_input_tokens_seen": 677784, "step": 3550 }, { "epoch": 1.8477130977130978, "grad_norm": 0.1832171380519867, "learning_rate": 4.617983367983368e-05, "loss": 0.1369, "num_input_tokens_seen": 678616, "step": 3555 }, { "epoch": 1.8503118503118503, "grad_norm": 0.6927157640457153, "learning_rate": 4.6244802494802496e-05, "loss": 0.2059, "num_input_tokens_seen": 679544, "step": 3560 }, { "epoch": 1.8529106029106028, "grad_norm": 0.3881455361843109, "learning_rate": 4.630977130977131e-05, "loss": 0.1969, "num_input_tokens_seen": 680440, "step": 3565 }, { "epoch": 1.8555093555093554, "grad_norm": 0.38154730200767517, "learning_rate": 4.637474012474013e-05, "loss": 0.092, "num_input_tokens_seen": 681464, "step": 3570 }, { "epoch": 1.8581081081081081, "grad_norm": 0.5349447131156921, "learning_rate": 4.643970893970894e-05, "loss": 0.1042, "num_input_tokens_seen": 682328, "step": 3575 }, { "epoch": 1.8607068607068609, "grad_norm": 0.9430475234985352, "learning_rate": 4.650467775467775e-05, "loss": 0.1933, "num_input_tokens_seen": 683288, "step": 3580 }, { "epoch": 1.8633056133056134, "grad_norm": 1.399356484413147, "learning_rate": 4.656964656964657e-05, "loss": 0.1939, "num_input_tokens_seen": 684280, "step": 3585 }, { "epoch": 1.865904365904366, "grad_norm": 1.4113291501998901, "learning_rate": 4.6634615384615384e-05, "loss": 0.2494, "num_input_tokens_seen": 685240, "step": 3590 }, { "epoch": 1.8685031185031185, "grad_norm": 0.7029809951782227, "learning_rate": 4.6699584199584204e-05, "loss": 0.2127, "num_input_tokens_seen": 686168, "step": 3595 }, { "epoch": 1.871101871101871, "grad_norm": 0.6306169629096985, "learning_rate": 4.6764553014553016e-05, "loss": 0.1623, "num_input_tokens_seen": 687096, "step": 3600 }, { "epoch": 1.8737006237006237, "grad_norm": 0.20404285192489624, "learning_rate": 4.6829521829521836e-05, "loss": 0.2293, "num_input_tokens_seen": 688120, "step": 3605 }, { "epoch": 1.8762993762993763, "grad_norm": 0.7490414381027222, "learning_rate": 4.689449064449065e-05, "loss": 0.2277, "num_input_tokens_seen": 689048, "step": 3610 }, { "epoch": 1.878898128898129, "grad_norm": 1.3351726531982422, "learning_rate": 4.695945945945946e-05, "loss": 0.1063, "num_input_tokens_seen": 689944, "step": 3615 }, { "epoch": 1.8814968814968815, "grad_norm": 1.3667960166931152, "learning_rate": 4.702442827442828e-05, "loss": 0.1873, "num_input_tokens_seen": 690808, "step": 3620 }, { "epoch": 1.884095634095634, "grad_norm": 1.1547454595565796, "learning_rate": 4.708939708939709e-05, "loss": 0.1838, "num_input_tokens_seen": 691800, "step": 3625 }, { "epoch": 1.8866943866943866, "grad_norm": 0.24048110842704773, "learning_rate": 4.7154365904365905e-05, "loss": 0.2345, "num_input_tokens_seen": 692760, "step": 3630 }, { "epoch": 1.8892931392931391, "grad_norm": 0.7976765632629395, "learning_rate": 4.7219334719334724e-05, "loss": 0.1509, "num_input_tokens_seen": 693656, "step": 3635 }, { "epoch": 1.8918918918918919, "grad_norm": 0.12960782647132874, "learning_rate": 4.728430353430354e-05, "loss": 0.2283, "num_input_tokens_seen": 694584, "step": 3640 }, { "epoch": 1.8944906444906446, "grad_norm": 0.8957483768463135, "learning_rate": 4.734927234927235e-05, "loss": 0.1731, "num_input_tokens_seen": 695480, "step": 3645 }, { "epoch": 1.8970893970893972, "grad_norm": 0.8831720352172852, "learning_rate": 4.741424116424117e-05, "loss": 0.2995, "num_input_tokens_seen": 696504, "step": 3650 }, { "epoch": 1.8996881496881497, "grad_norm": 1.4849334955215454, "learning_rate": 4.747920997920998e-05, "loss": 0.2189, "num_input_tokens_seen": 697464, "step": 3655 }, { "epoch": 1.9022869022869022, "grad_norm": 0.30606305599212646, "learning_rate": 4.7544178794178794e-05, "loss": 0.1233, "num_input_tokens_seen": 698424, "step": 3660 }, { "epoch": 1.9048856548856548, "grad_norm": 0.8761721253395081, "learning_rate": 4.760914760914761e-05, "loss": 0.2066, "num_input_tokens_seen": 699384, "step": 3665 }, { "epoch": 1.9074844074844075, "grad_norm": 0.7588943243026733, "learning_rate": 4.7674116424116426e-05, "loss": 0.2539, "num_input_tokens_seen": 700344, "step": 3670 }, { "epoch": 1.91008316008316, "grad_norm": 0.14025460183620453, "learning_rate": 4.773908523908524e-05, "loss": 0.1253, "num_input_tokens_seen": 701336, "step": 3675 }, { "epoch": 1.9126819126819128, "grad_norm": 0.26383477449417114, "learning_rate": 4.780405405405405e-05, "loss": 0.1267, "num_input_tokens_seen": 702360, "step": 3680 }, { "epoch": 1.9152806652806653, "grad_norm": 1.0405118465423584, "learning_rate": 4.786902286902287e-05, "loss": 0.1042, "num_input_tokens_seen": 703320, "step": 3685 }, { "epoch": 1.9178794178794178, "grad_norm": 0.36953312158584595, "learning_rate": 4.793399168399168e-05, "loss": 0.1827, "num_input_tokens_seen": 704248, "step": 3690 }, { "epoch": 1.9204781704781704, "grad_norm": 0.39564648270606995, "learning_rate": 4.79989604989605e-05, "loss": 0.0726, "num_input_tokens_seen": 705176, "step": 3695 }, { "epoch": 1.9230769230769231, "grad_norm": 1.5230475664138794, "learning_rate": 4.806392931392932e-05, "loss": 0.2226, "num_input_tokens_seen": 706072, "step": 3700 }, { "epoch": 1.9256756756756757, "grad_norm": 1.2212098836898804, "learning_rate": 4.8128898128898134e-05, "loss": 0.1706, "num_input_tokens_seen": 707032, "step": 3705 }, { "epoch": 1.9282744282744284, "grad_norm": 0.39751559495925903, "learning_rate": 4.8193866943866946e-05, "loss": 0.1502, "num_input_tokens_seen": 707992, "step": 3710 }, { "epoch": 1.930873180873181, "grad_norm": 0.8817111253738403, "learning_rate": 4.8258835758835766e-05, "loss": 0.0981, "num_input_tokens_seen": 708920, "step": 3715 }, { "epoch": 1.9334719334719335, "grad_norm": 0.44028013944625854, "learning_rate": 4.832380457380458e-05, "loss": 0.1942, "num_input_tokens_seen": 709912, "step": 3720 }, { "epoch": 1.936070686070686, "grad_norm": 0.28783485293388367, "learning_rate": 4.838877338877339e-05, "loss": 0.1595, "num_input_tokens_seen": 710904, "step": 3725 }, { "epoch": 1.9386694386694385, "grad_norm": 0.2749890089035034, "learning_rate": 4.845374220374221e-05, "loss": 0.0806, "num_input_tokens_seen": 711832, "step": 3730 }, { "epoch": 1.9412681912681913, "grad_norm": 0.33529195189476013, "learning_rate": 4.851871101871102e-05, "loss": 0.1107, "num_input_tokens_seen": 712696, "step": 3735 }, { "epoch": 1.943866943866944, "grad_norm": 1.0641525983810425, "learning_rate": 4.8583679833679835e-05, "loss": 0.1959, "num_input_tokens_seen": 713624, "step": 3740 }, { "epoch": 1.9464656964656966, "grad_norm": 0.4751814305782318, "learning_rate": 4.8648648648648654e-05, "loss": 0.2358, "num_input_tokens_seen": 714584, "step": 3745 }, { "epoch": 1.949064449064449, "grad_norm": 0.30131274461746216, "learning_rate": 4.871361746361747e-05, "loss": 0.1517, "num_input_tokens_seen": 715544, "step": 3750 }, { "epoch": 1.9516632016632016, "grad_norm": 0.5699447393417358, "learning_rate": 4.877858627858628e-05, "loss": 0.1468, "num_input_tokens_seen": 716568, "step": 3755 }, { "epoch": 1.9542619542619541, "grad_norm": 0.4613027572631836, "learning_rate": 4.884355509355509e-05, "loss": 0.1738, "num_input_tokens_seen": 717528, "step": 3760 }, { "epoch": 1.956860706860707, "grad_norm": 0.8871958255767822, "learning_rate": 4.890852390852391e-05, "loss": 0.1738, "num_input_tokens_seen": 718424, "step": 3765 }, { "epoch": 1.9594594594594594, "grad_norm": 1.1836731433868408, "learning_rate": 4.8973492723492724e-05, "loss": 0.1608, "num_input_tokens_seen": 719384, "step": 3770 }, { "epoch": 1.9620582120582122, "grad_norm": 0.33365553617477417, "learning_rate": 4.9038461538461536e-05, "loss": 0.0956, "num_input_tokens_seen": 720376, "step": 3775 }, { "epoch": 1.9646569646569647, "grad_norm": 1.6057512760162354, "learning_rate": 4.9103430353430356e-05, "loss": 0.1838, "num_input_tokens_seen": 721304, "step": 3780 }, { "epoch": 1.9672557172557172, "grad_norm": 0.5770496726036072, "learning_rate": 4.916839916839917e-05, "loss": 0.1983, "num_input_tokens_seen": 722200, "step": 3785 }, { "epoch": 1.9698544698544698, "grad_norm": 1.1589171886444092, "learning_rate": 4.923336798336798e-05, "loss": 0.2034, "num_input_tokens_seen": 723160, "step": 3790 }, { "epoch": 1.9724532224532223, "grad_norm": 0.42412853240966797, "learning_rate": 4.92983367983368e-05, "loss": 0.1539, "num_input_tokens_seen": 724024, "step": 3795 }, { "epoch": 1.975051975051975, "grad_norm": 0.290584534406662, "learning_rate": 4.936330561330562e-05, "loss": 0.1042, "num_input_tokens_seen": 724984, "step": 3800 }, { "epoch": 1.9776507276507278, "grad_norm": 0.8034878969192505, "learning_rate": 4.942827442827443e-05, "loss": 0.2155, "num_input_tokens_seen": 725944, "step": 3805 }, { "epoch": 1.9802494802494803, "grad_norm": 1.171046495437622, "learning_rate": 4.949324324324325e-05, "loss": 0.1628, "num_input_tokens_seen": 726808, "step": 3810 }, { "epoch": 1.9828482328482329, "grad_norm": 0.38701435923576355, "learning_rate": 4.9558212058212064e-05, "loss": 0.0696, "num_input_tokens_seen": 727672, "step": 3815 }, { "epoch": 1.9854469854469854, "grad_norm": 0.7848663330078125, "learning_rate": 4.9623180873180876e-05, "loss": 0.1124, "num_input_tokens_seen": 728664, "step": 3820 }, { "epoch": 1.988045738045738, "grad_norm": 0.6460594534873962, "learning_rate": 4.9688149688149695e-05, "loss": 0.2994, "num_input_tokens_seen": 729656, "step": 3825 }, { "epoch": 1.9906444906444907, "grad_norm": 0.6499034762382507, "learning_rate": 4.975311850311851e-05, "loss": 0.1314, "num_input_tokens_seen": 730520, "step": 3830 }, { "epoch": 1.9932432432432432, "grad_norm": 1.2955541610717773, "learning_rate": 4.981808731808732e-05, "loss": 0.3028, "num_input_tokens_seen": 731544, "step": 3835 }, { "epoch": 1.995841995841996, "grad_norm": 0.23787708580493927, "learning_rate": 4.988305613305613e-05, "loss": 0.2229, "num_input_tokens_seen": 732472, "step": 3840 }, { "epoch": 1.9984407484407485, "grad_norm": 0.7253393530845642, "learning_rate": 4.994802494802495e-05, "loss": 0.1676, "num_input_tokens_seen": 733400, "step": 3845 }, { "epoch": 2.0, "eval_loss": 0.17000456154346466, "eval_runtime": 8.01, "eval_samples_per_second": 106.866, "eval_steps_per_second": 26.717, "num_input_tokens_seen": 734016, "step": 3848 }, { "epoch": 2.001039501039501, "grad_norm": 0.3305705785751343, "learning_rate": 4.999999989713808e-05, "loss": 0.1044, "num_input_tokens_seen": 734336, "step": 3850 }, { "epoch": 2.0036382536382535, "grad_norm": 0.12137303501367569, "learning_rate": 4.999999629697109e-05, "loss": 0.0991, "num_input_tokens_seen": 735328, "step": 3855 }, { "epoch": 2.006237006237006, "grad_norm": 0.16531318426132202, "learning_rate": 4.9999987553709104e-05, "loss": 0.0916, "num_input_tokens_seen": 736224, "step": 3860 }, { "epoch": 2.008835758835759, "grad_norm": 0.9063527584075928, "learning_rate": 4.999997366735394e-05, "loss": 0.3333, "num_input_tokens_seen": 737184, "step": 3865 }, { "epoch": 2.0114345114345116, "grad_norm": 0.6067707538604736, "learning_rate": 4.999995463790843e-05, "loss": 0.1631, "num_input_tokens_seen": 738176, "step": 3870 }, { "epoch": 2.014033264033264, "grad_norm": 0.6408542394638062, "learning_rate": 4.9999930465376514e-05, "loss": 0.1038, "num_input_tokens_seen": 739136, "step": 3875 }, { "epoch": 2.0166320166320166, "grad_norm": 0.2441694140434265, "learning_rate": 4.999990114976315e-05, "loss": 0.1164, "num_input_tokens_seen": 740064, "step": 3880 }, { "epoch": 2.019230769230769, "grad_norm": 0.666832685470581, "learning_rate": 4.999986669107437e-05, "loss": 0.1764, "num_input_tokens_seen": 741024, "step": 3885 }, { "epoch": 2.0218295218295217, "grad_norm": 0.21654853224754333, "learning_rate": 4.999982708931727e-05, "loss": 0.1688, "num_input_tokens_seen": 742080, "step": 3890 }, { "epoch": 2.024428274428274, "grad_norm": 0.44288524985313416, "learning_rate": 4.999978234449999e-05, "loss": 0.1106, "num_input_tokens_seen": 743040, "step": 3895 }, { "epoch": 2.027027027027027, "grad_norm": 0.24240563809871674, "learning_rate": 4.999973245663174e-05, "loss": 0.1186, "num_input_tokens_seen": 744000, "step": 3900 }, { "epoch": 2.0296257796257797, "grad_norm": 0.3719214200973511, "learning_rate": 4.9999677425722776e-05, "loss": 0.0784, "num_input_tokens_seen": 744928, "step": 3905 }, { "epoch": 2.0322245322245323, "grad_norm": 0.2342180609703064, "learning_rate": 4.999961725178443e-05, "loss": 0.1333, "num_input_tokens_seen": 745888, "step": 3910 }, { "epoch": 2.034823284823285, "grad_norm": 0.4763653874397278, "learning_rate": 4.999955193482907e-05, "loss": 0.2113, "num_input_tokens_seen": 746752, "step": 3915 }, { "epoch": 2.0374220374220373, "grad_norm": 0.8226672410964966, "learning_rate": 4.999948147487013e-05, "loss": 0.1678, "num_input_tokens_seen": 747744, "step": 3920 }, { "epoch": 2.04002079002079, "grad_norm": 0.15492776036262512, "learning_rate": 4.999940587192213e-05, "loss": 0.113, "num_input_tokens_seen": 748640, "step": 3925 }, { "epoch": 2.042619542619543, "grad_norm": 0.7961555123329163, "learning_rate": 4.9999325126000606e-05, "loss": 0.2081, "num_input_tokens_seen": 749568, "step": 3930 }, { "epoch": 2.0452182952182953, "grad_norm": 0.5907206535339355, "learning_rate": 4.9999239237122165e-05, "loss": 0.2213, "num_input_tokens_seen": 750528, "step": 3935 }, { "epoch": 2.047817047817048, "grad_norm": 0.7823237180709839, "learning_rate": 4.999914820530448e-05, "loss": 0.1029, "num_input_tokens_seen": 751488, "step": 3940 }, { "epoch": 2.0504158004158004, "grad_norm": 0.5400593876838684, "learning_rate": 4.9999052030566284e-05, "loss": 0.3121, "num_input_tokens_seen": 752480, "step": 3945 }, { "epoch": 2.053014553014553, "grad_norm": 0.28538984060287476, "learning_rate": 4.9998950712927354e-05, "loss": 0.219, "num_input_tokens_seen": 753376, "step": 3950 }, { "epoch": 2.0556133056133055, "grad_norm": 0.36136966943740845, "learning_rate": 4.999884425240855e-05, "loss": 0.1713, "num_input_tokens_seen": 754336, "step": 3955 }, { "epoch": 2.0582120582120584, "grad_norm": 0.3673730492591858, "learning_rate": 4.9998732649031746e-05, "loss": 0.168, "num_input_tokens_seen": 755328, "step": 3960 }, { "epoch": 2.060810810810811, "grad_norm": 0.1764601767063141, "learning_rate": 4.999861590281992e-05, "loss": 0.0669, "num_input_tokens_seen": 756256, "step": 3965 }, { "epoch": 2.0634095634095635, "grad_norm": 1.473786473274231, "learning_rate": 4.999849401379709e-05, "loss": 0.2702, "num_input_tokens_seen": 757184, "step": 3970 }, { "epoch": 2.066008316008316, "grad_norm": 0.7246018648147583, "learning_rate": 4.9998366981988334e-05, "loss": 0.2387, "num_input_tokens_seen": 758112, "step": 3975 }, { "epoch": 2.0686070686070686, "grad_norm": 0.4896601140499115, "learning_rate": 4.9998234807419776e-05, "loss": 0.1494, "num_input_tokens_seen": 759008, "step": 3980 }, { "epoch": 2.071205821205821, "grad_norm": 0.3926684856414795, "learning_rate": 4.99980974901186e-05, "loss": 0.2189, "num_input_tokens_seen": 760032, "step": 3985 }, { "epoch": 2.0738045738045736, "grad_norm": 0.5428708791732788, "learning_rate": 4.999795503011308e-05, "loss": 0.1722, "num_input_tokens_seen": 760992, "step": 3990 }, { "epoch": 2.0764033264033266, "grad_norm": 1.1554688215255737, "learning_rate": 4.999780742743251e-05, "loss": 0.1785, "num_input_tokens_seen": 762016, "step": 3995 }, { "epoch": 2.079002079002079, "grad_norm": 1.323967695236206, "learning_rate": 4.999765468210724e-05, "loss": 0.1651, "num_input_tokens_seen": 763008, "step": 4000 }, { "epoch": 2.0816008316008316, "grad_norm": 0.36989375948905945, "learning_rate": 4.9997496794168726e-05, "loss": 0.2357, "num_input_tokens_seen": 763968, "step": 4005 }, { "epoch": 2.084199584199584, "grad_norm": 0.5872231721878052, "learning_rate": 4.9997333763649425e-05, "loss": 0.1045, "num_input_tokens_seen": 764928, "step": 4010 }, { "epoch": 2.0867983367983367, "grad_norm": 0.18398641049861908, "learning_rate": 4.999716559058288e-05, "loss": 0.1142, "num_input_tokens_seen": 765920, "step": 4015 }, { "epoch": 2.0893970893970892, "grad_norm": 0.7292748093605042, "learning_rate": 4.9996992275003694e-05, "loss": 0.155, "num_input_tokens_seen": 766848, "step": 4020 }, { "epoch": 2.091995841995842, "grad_norm": 0.6468285322189331, "learning_rate": 4.999681381694752e-05, "loss": 0.2874, "num_input_tokens_seen": 767744, "step": 4025 }, { "epoch": 2.0945945945945947, "grad_norm": 0.46364349126815796, "learning_rate": 4.999663021645107e-05, "loss": 0.241, "num_input_tokens_seen": 768672, "step": 4030 }, { "epoch": 2.0971933471933473, "grad_norm": 0.49129724502563477, "learning_rate": 4.999644147355211e-05, "loss": 0.133, "num_input_tokens_seen": 769600, "step": 4035 }, { "epoch": 2.0997920997921, "grad_norm": 0.5276790261268616, "learning_rate": 4.9996247588289484e-05, "loss": 0.1245, "num_input_tokens_seen": 770560, "step": 4040 }, { "epoch": 2.1023908523908523, "grad_norm": 0.7511879801750183, "learning_rate": 4.9996048560703067e-05, "loss": 0.3552, "num_input_tokens_seen": 771584, "step": 4045 }, { "epoch": 2.104989604989605, "grad_norm": 0.6658599972724915, "learning_rate": 4.99958443908338e-05, "loss": 0.2868, "num_input_tokens_seen": 772544, "step": 4050 }, { "epoch": 2.1075883575883574, "grad_norm": 0.6419160962104797, "learning_rate": 4.9995635078723705e-05, "loss": 0.0979, "num_input_tokens_seen": 773440, "step": 4055 }, { "epoch": 2.1101871101871104, "grad_norm": 0.29323044419288635, "learning_rate": 4.999542062441581e-05, "loss": 0.1429, "num_input_tokens_seen": 774464, "step": 4060 }, { "epoch": 2.112785862785863, "grad_norm": 0.5869513750076294, "learning_rate": 4.9995201027954264e-05, "loss": 0.18, "num_input_tokens_seen": 775456, "step": 4065 }, { "epoch": 2.1153846153846154, "grad_norm": 0.7719639539718628, "learning_rate": 4.999497628938423e-05, "loss": 0.1973, "num_input_tokens_seen": 776416, "step": 4070 }, { "epoch": 2.117983367983368, "grad_norm": 0.6665998101234436, "learning_rate": 4.999474640875195e-05, "loss": 0.1625, "num_input_tokens_seen": 777408, "step": 4075 }, { "epoch": 2.1205821205821205, "grad_norm": 0.46980786323547363, "learning_rate": 4.9994511386104696e-05, "loss": 0.1979, "num_input_tokens_seen": 778368, "step": 4080 }, { "epoch": 2.123180873180873, "grad_norm": 0.4816768765449524, "learning_rate": 4.999427122149084e-05, "loss": 0.2017, "num_input_tokens_seen": 779296, "step": 4085 }, { "epoch": 2.125779625779626, "grad_norm": 0.28083449602127075, "learning_rate": 4.999402591495978e-05, "loss": 0.0759, "num_input_tokens_seen": 780256, "step": 4090 }, { "epoch": 2.1283783783783785, "grad_norm": 0.32251155376434326, "learning_rate": 4.9993775466561984e-05, "loss": 0.2026, "num_input_tokens_seen": 781152, "step": 4095 }, { "epoch": 2.130977130977131, "grad_norm": 1.2118587493896484, "learning_rate": 4.999351987634897e-05, "loss": 0.2615, "num_input_tokens_seen": 782112, "step": 4100 }, { "epoch": 2.1335758835758836, "grad_norm": 0.21823734045028687, "learning_rate": 4.999325914437332e-05, "loss": 0.1664, "num_input_tokens_seen": 782976, "step": 4105 }, { "epoch": 2.136174636174636, "grad_norm": 1.4655901193618774, "learning_rate": 4.999299327068868e-05, "loss": 0.2852, "num_input_tokens_seen": 783904, "step": 4110 }, { "epoch": 2.1387733887733886, "grad_norm": 0.38344940543174744, "learning_rate": 4.9992722255349744e-05, "loss": 0.1571, "num_input_tokens_seen": 784864, "step": 4115 }, { "epoch": 2.141372141372141, "grad_norm": 1.0587539672851562, "learning_rate": 4.999244609841226e-05, "loss": 0.213, "num_input_tokens_seen": 785856, "step": 4120 }, { "epoch": 2.143970893970894, "grad_norm": 0.6872687935829163, "learning_rate": 4.999216479993305e-05, "loss": 0.0867, "num_input_tokens_seen": 786816, "step": 4125 }, { "epoch": 2.1465696465696467, "grad_norm": 1.7674157619476318, "learning_rate": 4.999187835996997e-05, "loss": 0.219, "num_input_tokens_seen": 787712, "step": 4130 }, { "epoch": 2.149168399168399, "grad_norm": 0.6129436492919922, "learning_rate": 4.9991586778581964e-05, "loss": 0.1977, "num_input_tokens_seen": 788672, "step": 4135 }, { "epoch": 2.1517671517671517, "grad_norm": 1.3241140842437744, "learning_rate": 4.9991290055829e-05, "loss": 0.2153, "num_input_tokens_seen": 789696, "step": 4140 }, { "epoch": 2.1543659043659042, "grad_norm": 0.4467782974243164, "learning_rate": 4.999098819177214e-05, "loss": 0.1661, "num_input_tokens_seen": 790656, "step": 4145 }, { "epoch": 2.156964656964657, "grad_norm": 0.24951563775539398, "learning_rate": 4.999068118647347e-05, "loss": 0.1508, "num_input_tokens_seen": 791616, "step": 4150 }, { "epoch": 2.1595634095634098, "grad_norm": 0.8553125858306885, "learning_rate": 4.999036903999615e-05, "loss": 0.2958, "num_input_tokens_seen": 792608, "step": 4155 }, { "epoch": 2.1621621621621623, "grad_norm": 0.9623565077781677, "learning_rate": 4.99900517524044e-05, "loss": 0.1065, "num_input_tokens_seen": 793536, "step": 4160 }, { "epoch": 2.164760914760915, "grad_norm": 0.4030519425868988, "learning_rate": 4.998972932376349e-05, "loss": 0.1867, "num_input_tokens_seen": 794496, "step": 4165 }, { "epoch": 2.1673596673596673, "grad_norm": 0.33816954493522644, "learning_rate": 4.998940175413976e-05, "loss": 0.0945, "num_input_tokens_seen": 795456, "step": 4170 }, { "epoch": 2.16995841995842, "grad_norm": 0.6206875443458557, "learning_rate": 4.998906904360059e-05, "loss": 0.1679, "num_input_tokens_seen": 796480, "step": 4175 }, { "epoch": 2.1725571725571724, "grad_norm": 0.6221020221710205, "learning_rate": 4.9988731192214425e-05, "loss": 0.1587, "num_input_tokens_seen": 797440, "step": 4180 }, { "epoch": 2.1751559251559254, "grad_norm": 0.7228969931602478, "learning_rate": 4.9988388200050776e-05, "loss": 0.1822, "num_input_tokens_seen": 798400, "step": 4185 }, { "epoch": 2.177754677754678, "grad_norm": 0.4229012131690979, "learning_rate": 4.99880400671802e-05, "loss": 0.2206, "num_input_tokens_seen": 799392, "step": 4190 }, { "epoch": 2.1803534303534304, "grad_norm": 0.2568321228027344, "learning_rate": 4.998768679367432e-05, "loss": 0.1512, "num_input_tokens_seen": 800352, "step": 4195 }, { "epoch": 2.182952182952183, "grad_norm": 1.3893325328826904, "learning_rate": 4.9987328379605816e-05, "loss": 0.2284, "num_input_tokens_seen": 801312, "step": 4200 }, { "epoch": 2.1855509355509355, "grad_norm": 1.0861376523971558, "learning_rate": 4.998696482504841e-05, "loss": 0.2796, "num_input_tokens_seen": 802240, "step": 4205 }, { "epoch": 2.188149688149688, "grad_norm": 1.8523333072662354, "learning_rate": 4.99865961300769e-05, "loss": 0.1727, "num_input_tokens_seen": 803136, "step": 4210 }, { "epoch": 2.1907484407484406, "grad_norm": 0.5318068861961365, "learning_rate": 4.998622229476714e-05, "loss": 0.1811, "num_input_tokens_seen": 804064, "step": 4215 }, { "epoch": 2.1933471933471935, "grad_norm": 0.7144148945808411, "learning_rate": 4.998584331919603e-05, "loss": 0.1982, "num_input_tokens_seen": 805056, "step": 4220 }, { "epoch": 2.195945945945946, "grad_norm": 0.5419634580612183, "learning_rate": 4.9985459203441544e-05, "loss": 0.1384, "num_input_tokens_seen": 806016, "step": 4225 }, { "epoch": 2.1985446985446986, "grad_norm": 0.22420376539230347, "learning_rate": 4.998506994758269e-05, "loss": 0.1118, "num_input_tokens_seen": 806912, "step": 4230 }, { "epoch": 2.201143451143451, "grad_norm": 1.62116539478302, "learning_rate": 4.9984675551699556e-05, "loss": 0.1935, "num_input_tokens_seen": 807744, "step": 4235 }, { "epoch": 2.2037422037422036, "grad_norm": 0.942470908164978, "learning_rate": 4.998427601587328e-05, "loss": 0.2513, "num_input_tokens_seen": 808672, "step": 4240 }, { "epoch": 2.206340956340956, "grad_norm": 0.4801364541053772, "learning_rate": 4.9983871340186045e-05, "loss": 0.1803, "num_input_tokens_seen": 809696, "step": 4245 }, { "epoch": 2.208939708939709, "grad_norm": 0.8304065465927124, "learning_rate": 4.9983461524721115e-05, "loss": 0.1075, "num_input_tokens_seen": 810656, "step": 4250 }, { "epoch": 2.2115384615384617, "grad_norm": 0.540752649307251, "learning_rate": 4.9983046569562794e-05, "loss": 0.1748, "num_input_tokens_seen": 811648, "step": 4255 }, { "epoch": 2.214137214137214, "grad_norm": 0.3304375410079956, "learning_rate": 4.998262647479645e-05, "loss": 0.1108, "num_input_tokens_seen": 812640, "step": 4260 }, { "epoch": 2.2167359667359667, "grad_norm": 0.601550281047821, "learning_rate": 4.99822012405085e-05, "loss": 0.1201, "num_input_tokens_seen": 813632, "step": 4265 }, { "epoch": 2.2193347193347193, "grad_norm": 0.5931054949760437, "learning_rate": 4.9981770866786434e-05, "loss": 0.2003, "num_input_tokens_seen": 814560, "step": 4270 }, { "epoch": 2.221933471933472, "grad_norm": 1.1230049133300781, "learning_rate": 4.998133535371879e-05, "loss": 0.1934, "num_input_tokens_seen": 815552, "step": 4275 }, { "epoch": 2.2245322245322248, "grad_norm": 0.6623509526252747, "learning_rate": 4.998089470139515e-05, "loss": 0.2001, "num_input_tokens_seen": 816416, "step": 4280 }, { "epoch": 2.2271309771309773, "grad_norm": 0.29321157932281494, "learning_rate": 4.998044890990619e-05, "loss": 0.1569, "num_input_tokens_seen": 817408, "step": 4285 }, { "epoch": 2.22972972972973, "grad_norm": 0.2229061722755432, "learning_rate": 4.99799979793436e-05, "loss": 0.1445, "num_input_tokens_seen": 818432, "step": 4290 }, { "epoch": 2.2323284823284824, "grad_norm": 1.0455477237701416, "learning_rate": 4.9979541909800146e-05, "loss": 0.143, "num_input_tokens_seen": 819360, "step": 4295 }, { "epoch": 2.234927234927235, "grad_norm": 0.5637617707252502, "learning_rate": 4.997908070136966e-05, "loss": 0.1374, "num_input_tokens_seen": 820288, "step": 4300 }, { "epoch": 2.2375259875259874, "grad_norm": 0.4628617465496063, "learning_rate": 4.9978614354147024e-05, "loss": 0.0833, "num_input_tokens_seen": 821216, "step": 4305 }, { "epoch": 2.24012474012474, "grad_norm": 0.6290476322174072, "learning_rate": 4.997814286822818e-05, "loss": 0.107, "num_input_tokens_seen": 822176, "step": 4310 }, { "epoch": 2.242723492723493, "grad_norm": 0.8001424074172974, "learning_rate": 4.9977666243710117e-05, "loss": 0.3768, "num_input_tokens_seen": 823168, "step": 4315 }, { "epoch": 2.2453222453222454, "grad_norm": 0.3670869767665863, "learning_rate": 4.997718448069089e-05, "loss": 0.177, "num_input_tokens_seen": 824096, "step": 4320 }, { "epoch": 2.247920997920998, "grad_norm": 0.5523179173469543, "learning_rate": 4.997669757926961e-05, "loss": 0.1449, "num_input_tokens_seen": 825056, "step": 4325 }, { "epoch": 2.2505197505197505, "grad_norm": 0.6809586882591248, "learning_rate": 4.997620553954645e-05, "loss": 0.1088, "num_input_tokens_seen": 825984, "step": 4330 }, { "epoch": 2.253118503118503, "grad_norm": 0.35349515080451965, "learning_rate": 4.997570836162262e-05, "loss": 0.1077, "num_input_tokens_seen": 826912, "step": 4335 }, { "epoch": 2.2557172557172556, "grad_norm": 0.5017203688621521, "learning_rate": 4.997520604560041e-05, "loss": 0.188, "num_input_tokens_seen": 827776, "step": 4340 }, { "epoch": 2.258316008316008, "grad_norm": 1.3008003234863281, "learning_rate": 4.997469859158316e-05, "loss": 0.1926, "num_input_tokens_seen": 828704, "step": 4345 }, { "epoch": 2.260914760914761, "grad_norm": 0.36380141973495483, "learning_rate": 4.997418599967527e-05, "loss": 0.3115, "num_input_tokens_seen": 829600, "step": 4350 }, { "epoch": 2.2635135135135136, "grad_norm": 0.18599846959114075, "learning_rate": 4.9973668269982175e-05, "loss": 0.1427, "num_input_tokens_seen": 830528, "step": 4355 }, { "epoch": 2.266112266112266, "grad_norm": 1.1515635251998901, "learning_rate": 4.99731454026104e-05, "loss": 0.1576, "num_input_tokens_seen": 831488, "step": 4360 }, { "epoch": 2.2687110187110187, "grad_norm": 1.0521996021270752, "learning_rate": 4.997261739766751e-05, "loss": 0.2207, "num_input_tokens_seen": 832416, "step": 4365 }, { "epoch": 2.271309771309771, "grad_norm": 0.5076079964637756, "learning_rate": 4.997208425526212e-05, "loss": 0.1559, "num_input_tokens_seen": 833376, "step": 4370 }, { "epoch": 2.2739085239085237, "grad_norm": 1.084655523300171, "learning_rate": 4.997154597550392e-05, "loss": 0.2606, "num_input_tokens_seen": 834240, "step": 4375 }, { "epoch": 2.2765072765072767, "grad_norm": 1.2387597560882568, "learning_rate": 4.9971002558503634e-05, "loss": 0.199, "num_input_tokens_seen": 835200, "step": 4380 }, { "epoch": 2.279106029106029, "grad_norm": 0.5756189823150635, "learning_rate": 4.997045400437307e-05, "loss": 0.111, "num_input_tokens_seen": 836096, "step": 4385 }, { "epoch": 2.2817047817047817, "grad_norm": 1.2298381328582764, "learning_rate": 4.996990031322506e-05, "loss": 0.2227, "num_input_tokens_seen": 836992, "step": 4390 }, { "epoch": 2.2843035343035343, "grad_norm": 1.4134010076522827, "learning_rate": 4.9969341485173535e-05, "loss": 0.211, "num_input_tokens_seen": 837888, "step": 4395 }, { "epoch": 2.286902286902287, "grad_norm": 0.23387344181537628, "learning_rate": 4.996877752033345e-05, "loss": 0.1422, "num_input_tokens_seen": 838848, "step": 4400 }, { "epoch": 2.2895010395010393, "grad_norm": 0.3158842623233795, "learning_rate": 4.996820841882082e-05, "loss": 0.0906, "num_input_tokens_seen": 839808, "step": 4405 }, { "epoch": 2.2920997920997923, "grad_norm": 0.3088211119174957, "learning_rate": 4.996763418075272e-05, "loss": 0.1651, "num_input_tokens_seen": 840768, "step": 4410 }, { "epoch": 2.294698544698545, "grad_norm": 1.093672275543213, "learning_rate": 4.996705480624731e-05, "loss": 0.2159, "num_input_tokens_seen": 841728, "step": 4415 }, { "epoch": 2.2972972972972974, "grad_norm": 0.3543796241283417, "learning_rate": 4.996647029542374e-05, "loss": 0.1437, "num_input_tokens_seen": 842688, "step": 4420 }, { "epoch": 2.29989604989605, "grad_norm": 0.27083688974380493, "learning_rate": 4.9965880648402285e-05, "loss": 0.1477, "num_input_tokens_seen": 843584, "step": 4425 }, { "epoch": 2.3024948024948024, "grad_norm": 0.2578639090061188, "learning_rate": 4.9965285865304254e-05, "loss": 0.1303, "num_input_tokens_seen": 844480, "step": 4430 }, { "epoch": 2.305093555093555, "grad_norm": 1.25508451461792, "learning_rate": 4.9964685946251986e-05, "loss": 0.2089, "num_input_tokens_seen": 845472, "step": 4435 }, { "epoch": 2.3076923076923075, "grad_norm": 0.5499305725097656, "learning_rate": 4.9964080891368925e-05, "loss": 0.1396, "num_input_tokens_seen": 846432, "step": 4440 }, { "epoch": 2.3102910602910605, "grad_norm": 2.1331064701080322, "learning_rate": 4.9963470700779524e-05, "loss": 0.1697, "num_input_tokens_seen": 847392, "step": 4445 }, { "epoch": 2.312889812889813, "grad_norm": 1.2702034711837769, "learning_rate": 4.996285537460932e-05, "loss": 0.2044, "num_input_tokens_seen": 848288, "step": 4450 }, { "epoch": 2.3154885654885655, "grad_norm": 0.6289039254188538, "learning_rate": 4.9962234912984905e-05, "loss": 0.2279, "num_input_tokens_seen": 849280, "step": 4455 }, { "epoch": 2.318087318087318, "grad_norm": 0.19911062717437744, "learning_rate": 4.996160931603391e-05, "loss": 0.231, "num_input_tokens_seen": 850176, "step": 4460 }, { "epoch": 2.3206860706860706, "grad_norm": 0.3845936954021454, "learning_rate": 4.996097858388506e-05, "loss": 0.0682, "num_input_tokens_seen": 851104, "step": 4465 }, { "epoch": 2.323284823284823, "grad_norm": 0.7105973362922668, "learning_rate": 4.996034271666808e-05, "loss": 0.2228, "num_input_tokens_seen": 852064, "step": 4470 }, { "epoch": 2.3258835758835756, "grad_norm": 1.184112787246704, "learning_rate": 4.995970171451381e-05, "loss": 0.1723, "num_input_tokens_seen": 852992, "step": 4475 }, { "epoch": 2.3284823284823286, "grad_norm": 1.3212097883224487, "learning_rate": 4.99590555775541e-05, "loss": 0.145, "num_input_tokens_seen": 853952, "step": 4480 }, { "epoch": 2.331081081081081, "grad_norm": 1.0076314210891724, "learning_rate": 4.995840430592189e-05, "loss": 0.0965, "num_input_tokens_seen": 854976, "step": 4485 }, { "epoch": 2.3336798336798337, "grad_norm": 0.6515754461288452, "learning_rate": 4.995774789975116e-05, "loss": 0.1583, "num_input_tokens_seen": 855936, "step": 4490 }, { "epoch": 2.336278586278586, "grad_norm": 0.28563833236694336, "learning_rate": 4.995708635917694e-05, "loss": 0.2085, "num_input_tokens_seen": 856928, "step": 4495 }, { "epoch": 2.3388773388773387, "grad_norm": 0.6832842826843262, "learning_rate": 4.9956419684335327e-05, "loss": 0.1644, "num_input_tokens_seen": 857984, "step": 4500 }, { "epoch": 2.3414760914760917, "grad_norm": 1.054660677909851, "learning_rate": 4.995574787536348e-05, "loss": 0.3082, "num_input_tokens_seen": 859008, "step": 4505 }, { "epoch": 2.3440748440748442, "grad_norm": 1.3565945625305176, "learning_rate": 4.9955070932399594e-05, "loss": 0.2037, "num_input_tokens_seen": 859936, "step": 4510 }, { "epoch": 2.3466735966735968, "grad_norm": 0.25994426012039185, "learning_rate": 4.995438885558294e-05, "loss": 0.1687, "num_input_tokens_seen": 860896, "step": 4515 }, { "epoch": 2.3492723492723493, "grad_norm": 0.435086190700531, "learning_rate": 4.995370164505383e-05, "loss": 0.1837, "num_input_tokens_seen": 861856, "step": 4520 }, { "epoch": 2.351871101871102, "grad_norm": 0.6707311868667603, "learning_rate": 4.995300930095366e-05, "loss": 0.1702, "num_input_tokens_seen": 862816, "step": 4525 }, { "epoch": 2.3544698544698544, "grad_norm": 0.39000698924064636, "learning_rate": 4.9952311823424834e-05, "loss": 0.1384, "num_input_tokens_seen": 863776, "step": 4530 }, { "epoch": 2.357068607068607, "grad_norm": 0.29309138655662537, "learning_rate": 4.995160921261086e-05, "loss": 0.1461, "num_input_tokens_seen": 864672, "step": 4535 }, { "epoch": 2.35966735966736, "grad_norm": 0.6223357915878296, "learning_rate": 4.9950901468656275e-05, "loss": 0.1616, "num_input_tokens_seen": 865600, "step": 4540 }, { "epoch": 2.3622661122661124, "grad_norm": 0.8010420799255371, "learning_rate": 4.995018859170668e-05, "loss": 0.0864, "num_input_tokens_seen": 866592, "step": 4545 }, { "epoch": 2.364864864864865, "grad_norm": 0.363442063331604, "learning_rate": 4.994947058190873e-05, "loss": 0.1277, "num_input_tokens_seen": 867584, "step": 4550 }, { "epoch": 2.3674636174636174, "grad_norm": 0.6200454235076904, "learning_rate": 4.994874743941013e-05, "loss": 0.19, "num_input_tokens_seen": 868576, "step": 4555 }, { "epoch": 2.37006237006237, "grad_norm": 0.3683689832687378, "learning_rate": 4.994801916435966e-05, "loss": 0.0899, "num_input_tokens_seen": 869536, "step": 4560 }, { "epoch": 2.3726611226611225, "grad_norm": 0.44364893436431885, "learning_rate": 4.994728575690714e-05, "loss": 0.1516, "num_input_tokens_seen": 870496, "step": 4565 }, { "epoch": 2.375259875259875, "grad_norm": 0.7135822176933289, "learning_rate": 4.9946547217203434e-05, "loss": 0.2691, "num_input_tokens_seen": 871456, "step": 4570 }, { "epoch": 2.377858627858628, "grad_norm": 0.9818466901779175, "learning_rate": 4.99458035454005e-05, "loss": 0.1399, "num_input_tokens_seen": 872416, "step": 4575 }, { "epoch": 2.3804573804573805, "grad_norm": 0.42218703031539917, "learning_rate": 4.9945054741651306e-05, "loss": 0.0962, "num_input_tokens_seen": 873280, "step": 4580 }, { "epoch": 2.383056133056133, "grad_norm": 0.3212207853794098, "learning_rate": 4.994430080610993e-05, "loss": 0.2019, "num_input_tokens_seen": 874208, "step": 4585 }, { "epoch": 2.3856548856548856, "grad_norm": 0.384489506483078, "learning_rate": 4.9943541738931444e-05, "loss": 0.1108, "num_input_tokens_seen": 875136, "step": 4590 }, { "epoch": 2.388253638253638, "grad_norm": 0.23848001658916473, "learning_rate": 4.9942777540272026e-05, "loss": 0.1026, "num_input_tokens_seen": 876064, "step": 4595 }, { "epoch": 2.390852390852391, "grad_norm": 0.25929200649261475, "learning_rate": 4.9942008210288884e-05, "loss": 0.0684, "num_input_tokens_seen": 876928, "step": 4600 }, { "epoch": 2.3934511434511436, "grad_norm": 0.25100383162498474, "learning_rate": 4.994123374914028e-05, "loss": 0.226, "num_input_tokens_seen": 877920, "step": 4605 }, { "epoch": 2.396049896049896, "grad_norm": 0.8617314696311951, "learning_rate": 4.994045415698554e-05, "loss": 0.2046, "num_input_tokens_seen": 878784, "step": 4610 }, { "epoch": 2.3986486486486487, "grad_norm": 0.9697505831718445, "learning_rate": 4.9939669433985056e-05, "loss": 0.2385, "num_input_tokens_seen": 879680, "step": 4615 }, { "epoch": 2.401247401247401, "grad_norm": 0.24100139737129211, "learning_rate": 4.993887958030026e-05, "loss": 0.1908, "num_input_tokens_seen": 880640, "step": 4620 }, { "epoch": 2.4038461538461537, "grad_norm": 0.4881550669670105, "learning_rate": 4.9938084596093635e-05, "loss": 0.2424, "num_input_tokens_seen": 881536, "step": 4625 }, { "epoch": 2.4064449064449063, "grad_norm": 1.2163348197937012, "learning_rate": 4.993728448152874e-05, "loss": 0.2449, "num_input_tokens_seen": 882464, "step": 4630 }, { "epoch": 2.4090436590436592, "grad_norm": 0.5042659044265747, "learning_rate": 4.993647923677017e-05, "loss": 0.1277, "num_input_tokens_seen": 883488, "step": 4635 }, { "epoch": 2.4116424116424118, "grad_norm": 0.668067216873169, "learning_rate": 4.993566886198359e-05, "loss": 0.1224, "num_input_tokens_seen": 884384, "step": 4640 }, { "epoch": 2.4142411642411643, "grad_norm": 0.4582677185535431, "learning_rate": 4.9934853357335696e-05, "loss": 0.1407, "num_input_tokens_seen": 885408, "step": 4645 }, { "epoch": 2.416839916839917, "grad_norm": 1.0441548824310303, "learning_rate": 4.9934032722994284e-05, "loss": 0.2308, "num_input_tokens_seen": 886400, "step": 4650 }, { "epoch": 2.4194386694386694, "grad_norm": 0.4319557547569275, "learning_rate": 4.993320695912816e-05, "loss": 0.0997, "num_input_tokens_seen": 887360, "step": 4655 }, { "epoch": 2.422037422037422, "grad_norm": 1.1691232919692993, "learning_rate": 4.9932376065907204e-05, "loss": 0.1177, "num_input_tokens_seen": 888288, "step": 4660 }, { "epoch": 2.4246361746361744, "grad_norm": 0.2709220051765442, "learning_rate": 4.9931540043502356e-05, "loss": 0.1586, "num_input_tokens_seen": 889152, "step": 4665 }, { "epoch": 2.4272349272349274, "grad_norm": 1.611160159111023, "learning_rate": 4.99306988920856e-05, "loss": 0.2108, "num_input_tokens_seen": 890176, "step": 4670 }, { "epoch": 2.42983367983368, "grad_norm": 0.24994683265686035, "learning_rate": 4.9929852611829983e-05, "loss": 0.1102, "num_input_tokens_seen": 891072, "step": 4675 }, { "epoch": 2.4324324324324325, "grad_norm": 0.42388927936553955, "learning_rate": 4.9929001202909614e-05, "loss": 0.0982, "num_input_tokens_seen": 892032, "step": 4680 }, { "epoch": 2.435031185031185, "grad_norm": 0.37202340364456177, "learning_rate": 4.992814466549963e-05, "loss": 0.1399, "num_input_tokens_seen": 892992, "step": 4685 }, { "epoch": 2.4376299376299375, "grad_norm": 0.701199471950531, "learning_rate": 4.992728299977626e-05, "loss": 0.2043, "num_input_tokens_seen": 894016, "step": 4690 }, { "epoch": 2.44022869022869, "grad_norm": 0.5390922427177429, "learning_rate": 4.992641620591676e-05, "loss": 0.2411, "num_input_tokens_seen": 895072, "step": 4695 }, { "epoch": 2.442827442827443, "grad_norm": 0.3057391345500946, "learning_rate": 4.992554428409945e-05, "loss": 0.0738, "num_input_tokens_seen": 896000, "step": 4700 }, { "epoch": 2.4454261954261955, "grad_norm": 0.31043797731399536, "learning_rate": 4.99246672345037e-05, "loss": 0.1069, "num_input_tokens_seen": 896896, "step": 4705 }, { "epoch": 2.448024948024948, "grad_norm": 0.5079529881477356, "learning_rate": 4.992378505730995e-05, "loss": 0.2039, "num_input_tokens_seen": 897856, "step": 4710 }, { "epoch": 2.4506237006237006, "grad_norm": 0.4462940990924835, "learning_rate": 4.992289775269968e-05, "loss": 0.1553, "num_input_tokens_seen": 898848, "step": 4715 }, { "epoch": 2.453222453222453, "grad_norm": 0.1988169103860855, "learning_rate": 4.992200532085543e-05, "loss": 0.1677, "num_input_tokens_seen": 899872, "step": 4720 }, { "epoch": 2.4558212058212057, "grad_norm": 0.4395054280757904, "learning_rate": 4.99211077619608e-05, "loss": 0.1437, "num_input_tokens_seen": 900864, "step": 4725 }, { "epoch": 2.4584199584199586, "grad_norm": 0.25130730867385864, "learning_rate": 4.992020507620043e-05, "loss": 0.0855, "num_input_tokens_seen": 901824, "step": 4730 }, { "epoch": 2.461018711018711, "grad_norm": 0.5002604126930237, "learning_rate": 4.991929726376002e-05, "loss": 0.0807, "num_input_tokens_seen": 902784, "step": 4735 }, { "epoch": 2.4636174636174637, "grad_norm": 0.3203350305557251, "learning_rate": 4.991838432482635e-05, "loss": 0.1787, "num_input_tokens_seen": 903840, "step": 4740 }, { "epoch": 2.4662162162162162, "grad_norm": 0.36800557374954224, "learning_rate": 4.991746625958721e-05, "loss": 0.208, "num_input_tokens_seen": 904832, "step": 4745 }, { "epoch": 2.4688149688149688, "grad_norm": 0.6736375093460083, "learning_rate": 4.991654306823148e-05, "loss": 0.1762, "num_input_tokens_seen": 905824, "step": 4750 }, { "epoch": 2.4714137214137213, "grad_norm": 0.5160521268844604, "learning_rate": 4.991561475094909e-05, "loss": 0.1804, "num_input_tokens_seen": 906784, "step": 4755 }, { "epoch": 2.474012474012474, "grad_norm": 0.6836442351341248, "learning_rate": 4.9914681307931e-05, "loss": 0.115, "num_input_tokens_seen": 907776, "step": 4760 }, { "epoch": 2.476611226611227, "grad_norm": 0.0830274298787117, "learning_rate": 4.991374273936924e-05, "loss": 0.1497, "num_input_tokens_seen": 908704, "step": 4765 }, { "epoch": 2.4792099792099793, "grad_norm": 0.27874717116355896, "learning_rate": 4.991279904545692e-05, "loss": 0.2352, "num_input_tokens_seen": 909760, "step": 4770 }, { "epoch": 2.481808731808732, "grad_norm": 0.5799997448921204, "learning_rate": 4.991185022638816e-05, "loss": 0.268, "num_input_tokens_seen": 910752, "step": 4775 }, { "epoch": 2.4844074844074844, "grad_norm": 1.0724658966064453, "learning_rate": 4.9910896282358154e-05, "loss": 0.2908, "num_input_tokens_seen": 911712, "step": 4780 }, { "epoch": 2.487006237006237, "grad_norm": 0.5187085270881653, "learning_rate": 4.9909937213563165e-05, "loss": 0.1388, "num_input_tokens_seen": 912672, "step": 4785 }, { "epoch": 2.4896049896049894, "grad_norm": 0.33571866154670715, "learning_rate": 4.990897302020048e-05, "loss": 0.1301, "num_input_tokens_seen": 913632, "step": 4790 }, { "epoch": 2.492203742203742, "grad_norm": 0.4850694239139557, "learning_rate": 4.990800370246848e-05, "loss": 0.2119, "num_input_tokens_seen": 914560, "step": 4795 }, { "epoch": 2.494802494802495, "grad_norm": 0.3001425266265869, "learning_rate": 4.990702926056654e-05, "loss": 0.1403, "num_input_tokens_seen": 915520, "step": 4800 }, { "epoch": 2.4974012474012475, "grad_norm": 0.2746588885784149, "learning_rate": 4.990604969469516e-05, "loss": 0.2888, "num_input_tokens_seen": 916448, "step": 4805 }, { "epoch": 2.5, "grad_norm": 1.0271190404891968, "learning_rate": 4.9905065005055853e-05, "loss": 0.204, "num_input_tokens_seen": 917408, "step": 4810 }, { "epoch": 2.5025987525987525, "grad_norm": 0.5580503344535828, "learning_rate": 4.9904075191851174e-05, "loss": 0.2025, "num_input_tokens_seen": 918368, "step": 4815 }, { "epoch": 2.505197505197505, "grad_norm": 0.11414411664009094, "learning_rate": 4.990308025528477e-05, "loss": 0.0663, "num_input_tokens_seen": 919392, "step": 4820 }, { "epoch": 2.507796257796258, "grad_norm": 0.8812574744224548, "learning_rate": 4.990208019556132e-05, "loss": 0.2781, "num_input_tokens_seen": 920416, "step": 4825 }, { "epoch": 2.51039501039501, "grad_norm": 0.5786721110343933, "learning_rate": 4.9901075012886557e-05, "loss": 0.167, "num_input_tokens_seen": 921440, "step": 4830 }, { "epoch": 2.512993762993763, "grad_norm": 1.2720566987991333, "learning_rate": 4.990006470746727e-05, "loss": 0.122, "num_input_tokens_seen": 922336, "step": 4835 }, { "epoch": 2.5155925155925156, "grad_norm": 0.8128222823143005, "learning_rate": 4.98990492795113e-05, "loss": 0.239, "num_input_tokens_seen": 923232, "step": 4840 }, { "epoch": 2.518191268191268, "grad_norm": 0.2440030872821808, "learning_rate": 4.989802872922756e-05, "loss": 0.1093, "num_input_tokens_seen": 924352, "step": 4845 }, { "epoch": 2.5207900207900207, "grad_norm": 1.1386722326278687, "learning_rate": 4.9897003056825985e-05, "loss": 0.1904, "num_input_tokens_seen": 925344, "step": 4850 }, { "epoch": 2.523388773388773, "grad_norm": 1.2216869592666626, "learning_rate": 4.9895972262517596e-05, "loss": 0.2026, "num_input_tokens_seen": 926240, "step": 4855 }, { "epoch": 2.525987525987526, "grad_norm": 0.5520274043083191, "learning_rate": 4.989493634651443e-05, "loss": 0.1492, "num_input_tokens_seen": 927136, "step": 4860 }, { "epoch": 2.5285862785862787, "grad_norm": 0.3857842981815338, "learning_rate": 4.9893895309029616e-05, "loss": 0.2076, "num_input_tokens_seen": 928128, "step": 4865 }, { "epoch": 2.5311850311850312, "grad_norm": 0.5521339774131775, "learning_rate": 4.989284915027732e-05, "loss": 0.199, "num_input_tokens_seen": 929024, "step": 4870 }, { "epoch": 2.5337837837837838, "grad_norm": 1.028639554977417, "learning_rate": 4.9891797870472755e-05, "loss": 0.1876, "num_input_tokens_seen": 930080, "step": 4875 }, { "epoch": 2.5363825363825363, "grad_norm": 0.3752289414405823, "learning_rate": 4.989074146983219e-05, "loss": 0.0796, "num_input_tokens_seen": 931008, "step": 4880 }, { "epoch": 2.538981288981289, "grad_norm": 0.6333829164505005, "learning_rate": 4.9889679948572974e-05, "loss": 0.1192, "num_input_tokens_seen": 931968, "step": 4885 }, { "epoch": 2.5415800415800414, "grad_norm": 0.44516193866729736, "learning_rate": 4.9888613306913455e-05, "loss": 0.1474, "num_input_tokens_seen": 932928, "step": 4890 }, { "epoch": 2.5441787941787943, "grad_norm": 0.9392460584640503, "learning_rate": 4.98875415450731e-05, "loss": 0.1618, "num_input_tokens_seen": 933792, "step": 4895 }, { "epoch": 2.546777546777547, "grad_norm": 1.1863723993301392, "learning_rate": 4.988646466327237e-05, "loss": 0.1882, "num_input_tokens_seen": 934752, "step": 4900 }, { "epoch": 2.5493762993762994, "grad_norm": 0.9171858429908752, "learning_rate": 4.9885382661732814e-05, "loss": 0.1638, "num_input_tokens_seen": 935776, "step": 4905 }, { "epoch": 2.551975051975052, "grad_norm": 1.1400285959243774, "learning_rate": 4.988429554067703e-05, "loss": 0.1747, "num_input_tokens_seen": 936800, "step": 4910 }, { "epoch": 2.5545738045738045, "grad_norm": 0.660295844078064, "learning_rate": 4.988320330032867e-05, "loss": 0.059, "num_input_tokens_seen": 937696, "step": 4915 }, { "epoch": 2.5571725571725574, "grad_norm": 0.9109883308410645, "learning_rate": 4.9882105940912416e-05, "loss": 0.2702, "num_input_tokens_seen": 938720, "step": 4920 }, { "epoch": 2.5597713097713095, "grad_norm": 0.783142626285553, "learning_rate": 4.988100346265403e-05, "loss": 0.2094, "num_input_tokens_seen": 939648, "step": 4925 }, { "epoch": 2.5623700623700625, "grad_norm": 0.6491544246673584, "learning_rate": 4.987989586578032e-05, "loss": 0.113, "num_input_tokens_seen": 940544, "step": 4930 }, { "epoch": 2.564968814968815, "grad_norm": 0.24680189788341522, "learning_rate": 4.987878315051914e-05, "loss": 0.1224, "num_input_tokens_seen": 941440, "step": 4935 }, { "epoch": 2.5675675675675675, "grad_norm": 1.4662798643112183, "learning_rate": 4.987766531709941e-05, "loss": 0.2202, "num_input_tokens_seen": 942368, "step": 4940 }, { "epoch": 2.57016632016632, "grad_norm": 1.0987838506698608, "learning_rate": 4.987654236575109e-05, "loss": 0.1596, "num_input_tokens_seen": 943328, "step": 4945 }, { "epoch": 2.5727650727650726, "grad_norm": 1.1863731145858765, "learning_rate": 4.987541429670519e-05, "loss": 0.217, "num_input_tokens_seen": 944320, "step": 4950 }, { "epoch": 2.5753638253638256, "grad_norm": 0.6003744602203369, "learning_rate": 4.98742811101938e-05, "loss": 0.1328, "num_input_tokens_seen": 945312, "step": 4955 }, { "epoch": 2.577962577962578, "grad_norm": 0.45368990302085876, "learning_rate": 4.9873142806450023e-05, "loss": 0.1398, "num_input_tokens_seen": 946272, "step": 4960 }, { "epoch": 2.5805613305613306, "grad_norm": 0.8963382840156555, "learning_rate": 4.9871999385708043e-05, "loss": 0.149, "num_input_tokens_seen": 947136, "step": 4965 }, { "epoch": 2.583160083160083, "grad_norm": 0.8215468525886536, "learning_rate": 4.98708508482031e-05, "loss": 0.2025, "num_input_tokens_seen": 948032, "step": 4970 }, { "epoch": 2.5857588357588357, "grad_norm": 0.24691596627235413, "learning_rate": 4.986969719417146e-05, "loss": 0.102, "num_input_tokens_seen": 948960, "step": 4975 }, { "epoch": 2.5883575883575882, "grad_norm": 0.32535937428474426, "learning_rate": 4.986853842385047e-05, "loss": 0.1276, "num_input_tokens_seen": 949920, "step": 4980 }, { "epoch": 2.5909563409563408, "grad_norm": 0.2872433364391327, "learning_rate": 4.986737453747851e-05, "loss": 0.1581, "num_input_tokens_seen": 950912, "step": 4985 }, { "epoch": 2.5935550935550937, "grad_norm": 0.5701714158058167, "learning_rate": 4.986620553529501e-05, "loss": 0.1161, "num_input_tokens_seen": 951872, "step": 4990 }, { "epoch": 2.5961538461538463, "grad_norm": 0.4002224802970886, "learning_rate": 4.986503141754047e-05, "loss": 0.1905, "num_input_tokens_seen": 952864, "step": 4995 }, { "epoch": 2.598752598752599, "grad_norm": 0.7081998586654663, "learning_rate": 4.986385218445644e-05, "loss": 0.2551, "num_input_tokens_seen": 953824, "step": 5000 }, { "epoch": 2.6013513513513513, "grad_norm": 1.1404179334640503, "learning_rate": 4.9862667836285504e-05, "loss": 0.2357, "num_input_tokens_seen": 954880, "step": 5005 }, { "epoch": 2.603950103950104, "grad_norm": 0.2382691502571106, "learning_rate": 4.986147837327132e-05, "loss": 0.1591, "num_input_tokens_seen": 955872, "step": 5010 }, { "epoch": 2.606548856548857, "grad_norm": 0.6326413154602051, "learning_rate": 4.986028379565859e-05, "loss": 0.198, "num_input_tokens_seen": 956832, "step": 5015 }, { "epoch": 2.609147609147609, "grad_norm": 0.2164658010005951, "learning_rate": 4.985908410369306e-05, "loss": 0.158, "num_input_tokens_seen": 957792, "step": 5020 }, { "epoch": 2.611746361746362, "grad_norm": 0.489403635263443, "learning_rate": 4.9857879297621545e-05, "loss": 0.0861, "num_input_tokens_seen": 958752, "step": 5025 }, { "epoch": 2.6143451143451144, "grad_norm": 0.7270484566688538, "learning_rate": 4.98566693776919e-05, "loss": 0.2107, "num_input_tokens_seen": 959680, "step": 5030 }, { "epoch": 2.616943866943867, "grad_norm": 0.2930031418800354, "learning_rate": 4.985545434415302e-05, "loss": 0.103, "num_input_tokens_seen": 960576, "step": 5035 }, { "epoch": 2.6195426195426195, "grad_norm": 0.3535689413547516, "learning_rate": 4.985423419725488e-05, "loss": 0.1311, "num_input_tokens_seen": 961536, "step": 5040 }, { "epoch": 2.622141372141372, "grad_norm": 0.24650491774082184, "learning_rate": 4.985300893724848e-05, "loss": 0.1004, "num_input_tokens_seen": 962464, "step": 5045 }, { "epoch": 2.624740124740125, "grad_norm": 1.120479941368103, "learning_rate": 4.985177856438591e-05, "loss": 0.1221, "num_input_tokens_seen": 963424, "step": 5050 }, { "epoch": 2.6273388773388775, "grad_norm": 0.7910293936729431, "learning_rate": 4.9850543078920266e-05, "loss": 0.221, "num_input_tokens_seen": 964416, "step": 5055 }, { "epoch": 2.62993762993763, "grad_norm": 0.13506470620632172, "learning_rate": 4.984930248110572e-05, "loss": 0.0483, "num_input_tokens_seen": 965344, "step": 5060 }, { "epoch": 2.6325363825363826, "grad_norm": 1.8119910955429077, "learning_rate": 4.98480567711975e-05, "loss": 0.132, "num_input_tokens_seen": 966240, "step": 5065 }, { "epoch": 2.635135135135135, "grad_norm": 0.43700510263442993, "learning_rate": 4.984680594945187e-05, "loss": 0.1929, "num_input_tokens_seen": 967200, "step": 5070 }, { "epoch": 2.6377338877338876, "grad_norm": 0.46889469027519226, "learning_rate": 4.9845550016126165e-05, "loss": 0.1509, "num_input_tokens_seen": 968128, "step": 5075 }, { "epoch": 2.64033264033264, "grad_norm": 0.8400687575340271, "learning_rate": 4.984428897147875e-05, "loss": 0.2868, "num_input_tokens_seen": 969152, "step": 5080 }, { "epoch": 2.642931392931393, "grad_norm": 1.1910600662231445, "learning_rate": 4.9843022815769046e-05, "loss": 0.1841, "num_input_tokens_seen": 970176, "step": 5085 }, { "epoch": 2.6455301455301456, "grad_norm": 0.6185735464096069, "learning_rate": 4.9841751549257546e-05, "loss": 0.1574, "num_input_tokens_seen": 971136, "step": 5090 }, { "epoch": 2.648128898128898, "grad_norm": 0.9305204749107361, "learning_rate": 4.984047517220578e-05, "loss": 0.3021, "num_input_tokens_seen": 972096, "step": 5095 }, { "epoch": 2.6507276507276507, "grad_norm": 0.276631236076355, "learning_rate": 4.983919368487632e-05, "loss": 0.1323, "num_input_tokens_seen": 973024, "step": 5100 }, { "epoch": 2.6533264033264032, "grad_norm": 0.555591344833374, "learning_rate": 4.9837907087532797e-05, "loss": 0.1054, "num_input_tokens_seen": 973984, "step": 5105 }, { "epoch": 2.6559251559251558, "grad_norm": 0.43051448464393616, "learning_rate": 4.98366153804399e-05, "loss": 0.1541, "num_input_tokens_seen": 974976, "step": 5110 }, { "epoch": 2.6585239085239083, "grad_norm": 0.7182730436325073, "learning_rate": 4.9835318563863365e-05, "loss": 0.2038, "num_input_tokens_seen": 975936, "step": 5115 }, { "epoch": 2.6611226611226613, "grad_norm": 0.28291743993759155, "learning_rate": 4.983401663806998e-05, "loss": 0.1857, "num_input_tokens_seen": 976800, "step": 5120 }, { "epoch": 2.663721413721414, "grad_norm": 1.2194076776504517, "learning_rate": 4.9832709603327575e-05, "loss": 0.2515, "num_input_tokens_seen": 977664, "step": 5125 }, { "epoch": 2.6663201663201663, "grad_norm": 0.17392660677433014, "learning_rate": 4.983139745990505e-05, "loss": 0.1391, "num_input_tokens_seen": 978688, "step": 5130 }, { "epoch": 2.668918918918919, "grad_norm": 0.340421199798584, "learning_rate": 4.983008020807233e-05, "loss": 0.1065, "num_input_tokens_seen": 979584, "step": 5135 }, { "epoch": 2.6715176715176714, "grad_norm": 0.34290412068367004, "learning_rate": 4.982875784810041e-05, "loss": 0.1214, "num_input_tokens_seen": 980480, "step": 5140 }, { "epoch": 2.6741164241164244, "grad_norm": 0.20918510854244232, "learning_rate": 4.982743038026133e-05, "loss": 0.1484, "num_input_tokens_seen": 981344, "step": 5145 }, { "epoch": 2.6767151767151764, "grad_norm": 1.0044227838516235, "learning_rate": 4.982609780482819e-05, "loss": 0.1735, "num_input_tokens_seen": 982272, "step": 5150 }, { "epoch": 2.6793139293139294, "grad_norm": 0.22908999025821686, "learning_rate": 4.982476012207512e-05, "loss": 0.2078, "num_input_tokens_seen": 983200, "step": 5155 }, { "epoch": 2.681912681912682, "grad_norm": 0.15350055694580078, "learning_rate": 4.9823417332277325e-05, "loss": 0.1436, "num_input_tokens_seen": 984224, "step": 5160 }, { "epoch": 2.6845114345114345, "grad_norm": 0.6356881856918335, "learning_rate": 4.982206943571104e-05, "loss": 0.1541, "num_input_tokens_seen": 985312, "step": 5165 }, { "epoch": 2.687110187110187, "grad_norm": 0.5981693863868713, "learning_rate": 4.982071643265356e-05, "loss": 0.1072, "num_input_tokens_seen": 986240, "step": 5170 }, { "epoch": 2.6897089397089395, "grad_norm": 0.6272427439689636, "learning_rate": 4.981935832338324e-05, "loss": 0.1878, "num_input_tokens_seen": 987232, "step": 5175 }, { "epoch": 2.6923076923076925, "grad_norm": 0.7114160060882568, "learning_rate": 4.981799510817946e-05, "loss": 0.3676, "num_input_tokens_seen": 988192, "step": 5180 }, { "epoch": 2.694906444906445, "grad_norm": 0.30374082922935486, "learning_rate": 4.981662678732268e-05, "loss": 0.1431, "num_input_tokens_seen": 989184, "step": 5185 }, { "epoch": 2.6975051975051976, "grad_norm": 1.1855171918869019, "learning_rate": 4.981525336109439e-05, "loss": 0.2007, "num_input_tokens_seen": 990144, "step": 5190 }, { "epoch": 2.70010395010395, "grad_norm": 0.6373949646949768, "learning_rate": 4.9813874829777134e-05, "loss": 0.219, "num_input_tokens_seen": 991040, "step": 5195 }, { "epoch": 2.7027027027027026, "grad_norm": 0.7215175628662109, "learning_rate": 4.9812491193654513e-05, "loss": 0.1483, "num_input_tokens_seen": 992096, "step": 5200 }, { "epoch": 2.705301455301455, "grad_norm": 0.8788598775863647, "learning_rate": 4.981110245301118e-05, "loss": 0.1935, "num_input_tokens_seen": 993024, "step": 5205 }, { "epoch": 2.7079002079002077, "grad_norm": 0.5508098602294922, "learning_rate": 4.980970860813281e-05, "loss": 0.1131, "num_input_tokens_seen": 994048, "step": 5210 }, { "epoch": 2.7104989604989607, "grad_norm": 0.21170297265052795, "learning_rate": 4.9808309659306174e-05, "loss": 0.1195, "num_input_tokens_seen": 995072, "step": 5215 }, { "epoch": 2.713097713097713, "grad_norm": 0.3526179790496826, "learning_rate": 4.9806905606819065e-05, "loss": 0.1139, "num_input_tokens_seen": 996064, "step": 5220 }, { "epoch": 2.7156964656964657, "grad_norm": 0.35122525691986084, "learning_rate": 4.980549645096031e-05, "loss": 0.1478, "num_input_tokens_seen": 996992, "step": 5225 }, { "epoch": 2.7182952182952183, "grad_norm": 0.06913677603006363, "learning_rate": 4.980408219201983e-05, "loss": 0.1712, "num_input_tokens_seen": 997984, "step": 5230 }, { "epoch": 2.720893970893971, "grad_norm": 0.4114558696746826, "learning_rate": 4.980266283028856e-05, "loss": 0.1725, "num_input_tokens_seen": 998912, "step": 5235 }, { "epoch": 2.7234927234927238, "grad_norm": 0.784984827041626, "learning_rate": 4.9801238366058506e-05, "loss": 0.0488, "num_input_tokens_seen": 999872, "step": 5240 }, { "epoch": 2.726091476091476, "grad_norm": 0.21286165714263916, "learning_rate": 4.97998087996227e-05, "loss": 0.3003, "num_input_tokens_seen": 1000832, "step": 5245 }, { "epoch": 2.728690228690229, "grad_norm": 0.39871397614479065, "learning_rate": 4.979837413127525e-05, "loss": 0.0842, "num_input_tokens_seen": 1001760, "step": 5250 }, { "epoch": 2.7312889812889813, "grad_norm": 0.1931775063276291, "learning_rate": 4.97969343613113e-05, "loss": 0.1277, "num_input_tokens_seen": 1002752, "step": 5255 }, { "epoch": 2.733887733887734, "grad_norm": 0.31788694858551025, "learning_rate": 4.979548949002703e-05, "loss": 0.1899, "num_input_tokens_seen": 1003680, "step": 5260 }, { "epoch": 2.7364864864864864, "grad_norm": 0.48979681730270386, "learning_rate": 4.97940395177197e-05, "loss": 0.2061, "num_input_tokens_seen": 1004576, "step": 5265 }, { "epoch": 2.739085239085239, "grad_norm": 1.16507887840271, "learning_rate": 4.9792584444687606e-05, "loss": 0.1199, "num_input_tokens_seen": 1005504, "step": 5270 }, { "epoch": 2.741683991683992, "grad_norm": 0.7942014336585999, "learning_rate": 4.9791124271230084e-05, "loss": 0.168, "num_input_tokens_seen": 1006400, "step": 5275 }, { "epoch": 2.7442827442827444, "grad_norm": 0.20539332926273346, "learning_rate": 4.978965899764752e-05, "loss": 0.2123, "num_input_tokens_seen": 1007328, "step": 5280 }, { "epoch": 2.746881496881497, "grad_norm": 0.29739025235176086, "learning_rate": 4.9788188624241374e-05, "loss": 0.2475, "num_input_tokens_seen": 1008320, "step": 5285 }, { "epoch": 2.7494802494802495, "grad_norm": 0.7227691411972046, "learning_rate": 4.978671315131411e-05, "loss": 0.1762, "num_input_tokens_seen": 1009216, "step": 5290 }, { "epoch": 2.752079002079002, "grad_norm": 0.23358982801437378, "learning_rate": 4.97852325791693e-05, "loss": 0.1238, "num_input_tokens_seen": 1010208, "step": 5295 }, { "epoch": 2.7546777546777546, "grad_norm": 0.5255149006843567, "learning_rate": 4.978374690811151e-05, "loss": 0.1614, "num_input_tokens_seen": 1011232, "step": 5300 }, { "epoch": 2.757276507276507, "grad_norm": 0.8581667542457581, "learning_rate": 4.978225613844639e-05, "loss": 0.1342, "num_input_tokens_seen": 1012128, "step": 5305 }, { "epoch": 2.75987525987526, "grad_norm": 0.3331887125968933, "learning_rate": 4.978076027048062e-05, "loss": 0.1694, "num_input_tokens_seen": 1013056, "step": 5310 }, { "epoch": 2.7624740124740126, "grad_norm": 0.6947991251945496, "learning_rate": 4.977925930452193e-05, "loss": 0.1538, "num_input_tokens_seen": 1014016, "step": 5315 }, { "epoch": 2.765072765072765, "grad_norm": 0.3834133744239807, "learning_rate": 4.977775324087912e-05, "loss": 0.1087, "num_input_tokens_seen": 1014944, "step": 5320 }, { "epoch": 2.7676715176715176, "grad_norm": 0.3305242657661438, "learning_rate": 4.9776242079862015e-05, "loss": 0.1559, "num_input_tokens_seen": 1016000, "step": 5325 }, { "epoch": 2.77027027027027, "grad_norm": 0.6148440837860107, "learning_rate": 4.97747258217815e-05, "loss": 0.1411, "num_input_tokens_seen": 1016960, "step": 5330 }, { "epoch": 2.7728690228690227, "grad_norm": 0.2334194779396057, "learning_rate": 4.977320446694949e-05, "loss": 0.1933, "num_input_tokens_seen": 1017888, "step": 5335 }, { "epoch": 2.7754677754677752, "grad_norm": 0.7802174687385559, "learning_rate": 4.977167801567899e-05, "loss": 0.1645, "num_input_tokens_seen": 1018944, "step": 5340 }, { "epoch": 2.778066528066528, "grad_norm": 0.27332574129104614, "learning_rate": 4.977014646828401e-05, "loss": 0.0992, "num_input_tokens_seen": 1020000, "step": 5345 }, { "epoch": 2.7806652806652807, "grad_norm": 1.3905291557312012, "learning_rate": 4.976860982507962e-05, "loss": 0.1759, "num_input_tokens_seen": 1020992, "step": 5350 }, { "epoch": 2.7832640332640333, "grad_norm": 0.6145921945571899, "learning_rate": 4.9767068086381964e-05, "loss": 0.1259, "num_input_tokens_seen": 1021888, "step": 5355 }, { "epoch": 2.785862785862786, "grad_norm": 0.49112606048583984, "learning_rate": 4.97655212525082e-05, "loss": 0.0844, "num_input_tokens_seen": 1022816, "step": 5360 }, { "epoch": 2.7884615384615383, "grad_norm": 0.6280801892280579, "learning_rate": 4.976396932377656e-05, "loss": 0.1348, "num_input_tokens_seen": 1023904, "step": 5365 }, { "epoch": 2.7910602910602913, "grad_norm": 0.5841330885887146, "learning_rate": 4.97624123005063e-05, "loss": 0.106, "num_input_tokens_seen": 1024864, "step": 5370 }, { "epoch": 2.7936590436590434, "grad_norm": 0.6014730334281921, "learning_rate": 4.9760850183017734e-05, "loss": 0.1731, "num_input_tokens_seen": 1025824, "step": 5375 }, { "epoch": 2.7962577962577964, "grad_norm": 0.6754744648933411, "learning_rate": 4.9759282971632247e-05, "loss": 0.1196, "num_input_tokens_seen": 1026720, "step": 5380 }, { "epoch": 2.798856548856549, "grad_norm": 0.3507795035839081, "learning_rate": 4.975771066667224e-05, "loss": 0.1045, "num_input_tokens_seen": 1027648, "step": 5385 }, { "epoch": 2.8014553014553014, "grad_norm": 0.48223787546157837, "learning_rate": 4.975613326846117e-05, "loss": 0.1744, "num_input_tokens_seen": 1028544, "step": 5390 }, { "epoch": 2.804054054054054, "grad_norm": 0.2790639400482178, "learning_rate": 4.975455077732354e-05, "loss": 0.2805, "num_input_tokens_seen": 1029536, "step": 5395 }, { "epoch": 2.8066528066528065, "grad_norm": 0.48346397280693054, "learning_rate": 4.975296319358493e-05, "loss": 0.1449, "num_input_tokens_seen": 1030592, "step": 5400 }, { "epoch": 2.8092515592515594, "grad_norm": 0.6338891386985779, "learning_rate": 4.975137051757192e-05, "loss": 0.0886, "num_input_tokens_seen": 1031520, "step": 5405 }, { "epoch": 2.811850311850312, "grad_norm": 0.8979053497314453, "learning_rate": 4.974977274961217e-05, "loss": 0.1779, "num_input_tokens_seen": 1032480, "step": 5410 }, { "epoch": 2.8144490644490645, "grad_norm": 1.0714260339736938, "learning_rate": 4.974816989003438e-05, "loss": 0.2005, "num_input_tokens_seen": 1033344, "step": 5415 }, { "epoch": 2.817047817047817, "grad_norm": 0.3874751627445221, "learning_rate": 4.97465619391683e-05, "loss": 0.0943, "num_input_tokens_seen": 1034336, "step": 5420 }, { "epoch": 2.8196465696465696, "grad_norm": 0.2922937572002411, "learning_rate": 4.974494889734471e-05, "loss": 0.0739, "num_input_tokens_seen": 1035200, "step": 5425 }, { "epoch": 2.822245322245322, "grad_norm": 1.1453090906143188, "learning_rate": 4.974333076489547e-05, "loss": 0.1559, "num_input_tokens_seen": 1036160, "step": 5430 }, { "epoch": 2.8248440748440746, "grad_norm": 0.8781421184539795, "learning_rate": 4.974170754215345e-05, "loss": 0.3515, "num_input_tokens_seen": 1037152, "step": 5435 }, { "epoch": 2.8274428274428276, "grad_norm": 0.913190484046936, "learning_rate": 4.974007922945261e-05, "loss": 0.1106, "num_input_tokens_seen": 1038112, "step": 5440 }, { "epoch": 2.83004158004158, "grad_norm": 0.6923695802688599, "learning_rate": 4.97384458271279e-05, "loss": 0.217, "num_input_tokens_seen": 1039008, "step": 5445 }, { "epoch": 2.8326403326403327, "grad_norm": 0.45894357562065125, "learning_rate": 4.973680733551538e-05, "loss": 0.1266, "num_input_tokens_seen": 1039968, "step": 5450 }, { "epoch": 2.835239085239085, "grad_norm": 0.13470742106437683, "learning_rate": 4.97351637549521e-05, "loss": 0.1617, "num_input_tokens_seen": 1040928, "step": 5455 }, { "epoch": 2.8378378378378377, "grad_norm": 0.9881826639175415, "learning_rate": 4.97335150857762e-05, "loss": 0.0938, "num_input_tokens_seen": 1041824, "step": 5460 }, { "epoch": 2.8404365904365907, "grad_norm": 0.6474810838699341, "learning_rate": 4.9731861328326854e-05, "loss": 0.1354, "num_input_tokens_seen": 1042784, "step": 5465 }, { "epoch": 2.8430353430353428, "grad_norm": 0.481431245803833, "learning_rate": 4.973020248294428e-05, "loss": 0.113, "num_input_tokens_seen": 1043648, "step": 5470 }, { "epoch": 2.8456340956340958, "grad_norm": 1.0423133373260498, "learning_rate": 4.9728538549969725e-05, "loss": 0.1164, "num_input_tokens_seen": 1044576, "step": 5475 }, { "epoch": 2.8482328482328483, "grad_norm": 0.5092892646789551, "learning_rate": 4.972686952974551e-05, "loss": 0.1222, "num_input_tokens_seen": 1045504, "step": 5480 }, { "epoch": 2.850831600831601, "grad_norm": 1.7725508213043213, "learning_rate": 4.9725195422614996e-05, "loss": 0.2992, "num_input_tokens_seen": 1046432, "step": 5485 }, { "epoch": 2.8534303534303533, "grad_norm": 0.47089266777038574, "learning_rate": 4.9723516228922576e-05, "loss": 0.1933, "num_input_tokens_seen": 1047328, "step": 5490 }, { "epoch": 2.856029106029106, "grad_norm": 0.6659146547317505, "learning_rate": 4.9721831949013715e-05, "loss": 0.1601, "num_input_tokens_seen": 1048288, "step": 5495 }, { "epoch": 2.858627858627859, "grad_norm": 1.0292398929595947, "learning_rate": 4.97201425832349e-05, "loss": 0.2671, "num_input_tokens_seen": 1049344, "step": 5500 }, { "epoch": 2.8612266112266114, "grad_norm": 0.26357197761535645, "learning_rate": 4.971844813193367e-05, "loss": 0.227, "num_input_tokens_seen": 1050336, "step": 5505 }, { "epoch": 2.863825363825364, "grad_norm": 0.28280478715896606, "learning_rate": 4.971674859545863e-05, "loss": 0.0417, "num_input_tokens_seen": 1051200, "step": 5510 }, { "epoch": 2.8664241164241164, "grad_norm": 0.417657732963562, "learning_rate": 4.971504397415939e-05, "loss": 0.0975, "num_input_tokens_seen": 1052160, "step": 5515 }, { "epoch": 2.869022869022869, "grad_norm": 0.7911060452461243, "learning_rate": 4.9713334268386655e-05, "loss": 0.224, "num_input_tokens_seen": 1053184, "step": 5520 }, { "epoch": 2.8716216216216215, "grad_norm": 0.13593554496765137, "learning_rate": 4.971161947849214e-05, "loss": 0.1662, "num_input_tokens_seen": 1054112, "step": 5525 }, { "epoch": 2.874220374220374, "grad_norm": 0.46435925364494324, "learning_rate": 4.970989960482863e-05, "loss": 0.1514, "num_input_tokens_seen": 1055104, "step": 5530 }, { "epoch": 2.876819126819127, "grad_norm": 0.5249483585357666, "learning_rate": 4.970817464774993e-05, "loss": 0.0581, "num_input_tokens_seen": 1056032, "step": 5535 }, { "epoch": 2.8794178794178795, "grad_norm": 0.4556431472301483, "learning_rate": 4.97064446076109e-05, "loss": 0.161, "num_input_tokens_seen": 1056992, "step": 5540 }, { "epoch": 2.882016632016632, "grad_norm": 0.4683058559894562, "learning_rate": 4.9704709484767476e-05, "loss": 0.2313, "num_input_tokens_seen": 1057952, "step": 5545 }, { "epoch": 2.8846153846153846, "grad_norm": 0.394188791513443, "learning_rate": 4.9702969279576584e-05, "loss": 0.1469, "num_input_tokens_seen": 1058880, "step": 5550 }, { "epoch": 2.887214137214137, "grad_norm": 0.5913133025169373, "learning_rate": 4.970122399239625e-05, "loss": 0.1446, "num_input_tokens_seen": 1059840, "step": 5555 }, { "epoch": 2.88981288981289, "grad_norm": 0.8728612065315247, "learning_rate": 4.969947362358551e-05, "loss": 0.1688, "num_input_tokens_seen": 1060896, "step": 5560 }, { "epoch": 2.892411642411642, "grad_norm": 0.49863138794898987, "learning_rate": 4.969771817350445e-05, "loss": 0.1804, "num_input_tokens_seen": 1061824, "step": 5565 }, { "epoch": 2.895010395010395, "grad_norm": 0.6123398542404175, "learning_rate": 4.969595764251422e-05, "loss": 0.1543, "num_input_tokens_seen": 1062816, "step": 5570 }, { "epoch": 2.8976091476091477, "grad_norm": 1.4624290466308594, "learning_rate": 4.9694192030977005e-05, "loss": 0.1506, "num_input_tokens_seen": 1063712, "step": 5575 }, { "epoch": 2.9002079002079, "grad_norm": 1.4939957857131958, "learning_rate": 4.9692421339256024e-05, "loss": 0.3542, "num_input_tokens_seen": 1064608, "step": 5580 }, { "epoch": 2.9028066528066527, "grad_norm": 0.2584434151649475, "learning_rate": 4.969064556771556e-05, "loss": 0.0584, "num_input_tokens_seen": 1065536, "step": 5585 }, { "epoch": 2.9054054054054053, "grad_norm": 0.5646353960037231, "learning_rate": 4.968886471672091e-05, "loss": 0.0913, "num_input_tokens_seen": 1066464, "step": 5590 }, { "epoch": 2.9080041580041582, "grad_norm": 0.33663201332092285, "learning_rate": 4.968707878663846e-05, "loss": 0.1945, "num_input_tokens_seen": 1067424, "step": 5595 }, { "epoch": 2.9106029106029108, "grad_norm": 0.5457217693328857, "learning_rate": 4.9685287777835615e-05, "loss": 0.1321, "num_input_tokens_seen": 1068320, "step": 5600 }, { "epoch": 2.9132016632016633, "grad_norm": 0.9213875532150269, "learning_rate": 4.9683491690680824e-05, "loss": 0.1385, "num_input_tokens_seen": 1069248, "step": 5605 }, { "epoch": 2.915800415800416, "grad_norm": 0.9448775053024292, "learning_rate": 4.9681690525543586e-05, "loss": 0.2581, "num_input_tokens_seen": 1070112, "step": 5610 }, { "epoch": 2.9183991683991684, "grad_norm": 0.43289709091186523, "learning_rate": 4.967988428279444e-05, "loss": 0.2438, "num_input_tokens_seen": 1071008, "step": 5615 }, { "epoch": 2.920997920997921, "grad_norm": 0.273540198802948, "learning_rate": 4.9678072962804975e-05, "loss": 0.1106, "num_input_tokens_seen": 1071936, "step": 5620 }, { "epoch": 2.9235966735966734, "grad_norm": 0.629437267780304, "learning_rate": 4.967625656594782e-05, "loss": 0.1234, "num_input_tokens_seen": 1072896, "step": 5625 }, { "epoch": 2.9261954261954264, "grad_norm": 0.5272436738014221, "learning_rate": 4.967443509259666e-05, "loss": 0.1635, "num_input_tokens_seen": 1073856, "step": 5630 }, { "epoch": 2.928794178794179, "grad_norm": 0.375731498003006, "learning_rate": 4.967260854312622e-05, "loss": 0.1462, "num_input_tokens_seen": 1074816, "step": 5635 }, { "epoch": 2.9313929313929314, "grad_norm": 0.32181960344314575, "learning_rate": 4.9670776917912244e-05, "loss": 0.1422, "num_input_tokens_seen": 1075808, "step": 5640 }, { "epoch": 2.933991683991684, "grad_norm": 0.5734915137290955, "learning_rate": 4.966894021733155e-05, "loss": 0.156, "num_input_tokens_seen": 1076672, "step": 5645 }, { "epoch": 2.9365904365904365, "grad_norm": 0.11876768618822098, "learning_rate": 4.9667098441761996e-05, "loss": 0.0697, "num_input_tokens_seen": 1077600, "step": 5650 }, { "epoch": 2.939189189189189, "grad_norm": 0.12467844039201736, "learning_rate": 4.966525159158248e-05, "loss": 0.0742, "num_input_tokens_seen": 1078560, "step": 5655 }, { "epoch": 2.9417879417879416, "grad_norm": 1.13033127784729, "learning_rate": 4.966339966717294e-05, "loss": 0.2105, "num_input_tokens_seen": 1079552, "step": 5660 }, { "epoch": 2.9443866943866945, "grad_norm": 0.9842479228973389, "learning_rate": 4.966154266891435e-05, "loss": 0.078, "num_input_tokens_seen": 1080480, "step": 5665 }, { "epoch": 2.946985446985447, "grad_norm": 0.2924063801765442, "learning_rate": 4.965968059718876e-05, "loss": 0.2176, "num_input_tokens_seen": 1081440, "step": 5670 }, { "epoch": 2.9495841995841996, "grad_norm": 0.3507138788700104, "learning_rate": 4.965781345237923e-05, "loss": 0.1398, "num_input_tokens_seen": 1082400, "step": 5675 }, { "epoch": 2.952182952182952, "grad_norm": 0.192491352558136, "learning_rate": 4.9655941234869875e-05, "loss": 0.1534, "num_input_tokens_seen": 1083328, "step": 5680 }, { "epoch": 2.9547817047817047, "grad_norm": 0.49259960651397705, "learning_rate": 4.965406394504586e-05, "loss": 0.1119, "num_input_tokens_seen": 1084288, "step": 5685 }, { "epoch": 2.9573804573804576, "grad_norm": 0.6880152225494385, "learning_rate": 4.9652181583293376e-05, "loss": 0.1686, "num_input_tokens_seen": 1085216, "step": 5690 }, { "epoch": 2.9599792099792097, "grad_norm": 0.7920842170715332, "learning_rate": 4.9650294149999694e-05, "loss": 0.1949, "num_input_tokens_seen": 1086208, "step": 5695 }, { "epoch": 2.9625779625779627, "grad_norm": 0.7129809260368347, "learning_rate": 4.964840164555308e-05, "loss": 0.1691, "num_input_tokens_seen": 1087200, "step": 5700 }, { "epoch": 2.965176715176715, "grad_norm": 0.48484915494918823, "learning_rate": 4.964650407034288e-05, "loss": 0.1549, "num_input_tokens_seen": 1088160, "step": 5705 }, { "epoch": 2.9677754677754677, "grad_norm": 0.6074535250663757, "learning_rate": 4.9644601424759476e-05, "loss": 0.2233, "num_input_tokens_seen": 1089056, "step": 5710 }, { "epoch": 2.9703742203742203, "grad_norm": 0.3754618167877197, "learning_rate": 4.9642693709194265e-05, "loss": 0.2052, "num_input_tokens_seen": 1089984, "step": 5715 }, { "epoch": 2.972972972972973, "grad_norm": 0.1865776926279068, "learning_rate": 4.9640780924039734e-05, "loss": 0.2053, "num_input_tokens_seen": 1090944, "step": 5720 }, { "epoch": 2.975571725571726, "grad_norm": 0.3772616386413574, "learning_rate": 4.963886306968937e-05, "loss": 0.0831, "num_input_tokens_seen": 1091904, "step": 5725 }, { "epoch": 2.9781704781704783, "grad_norm": 0.32307180762290955, "learning_rate": 4.963694014653773e-05, "loss": 0.0475, "num_input_tokens_seen": 1092896, "step": 5730 }, { "epoch": 2.980769230769231, "grad_norm": 0.9583224654197693, "learning_rate": 4.9635012154980407e-05, "loss": 0.2581, "num_input_tokens_seen": 1094016, "step": 5735 }, { "epoch": 2.9833679833679834, "grad_norm": 0.7176981568336487, "learning_rate": 4.963307909541403e-05, "loss": 0.2234, "num_input_tokens_seen": 1094912, "step": 5740 }, { "epoch": 2.985966735966736, "grad_norm": 0.1590694785118103, "learning_rate": 4.963114096823628e-05, "loss": 0.1392, "num_input_tokens_seen": 1095872, "step": 5745 }, { "epoch": 2.9885654885654884, "grad_norm": 0.04692073538899422, "learning_rate": 4.962919777384587e-05, "loss": 0.0989, "num_input_tokens_seen": 1096864, "step": 5750 }, { "epoch": 2.991164241164241, "grad_norm": 0.8834941983222961, "learning_rate": 4.9627249512642573e-05, "loss": 0.1485, "num_input_tokens_seen": 1097760, "step": 5755 }, { "epoch": 2.993762993762994, "grad_norm": 0.8499147891998291, "learning_rate": 4.962529618502718e-05, "loss": 0.0916, "num_input_tokens_seen": 1098656, "step": 5760 }, { "epoch": 2.9963617463617465, "grad_norm": 0.4418126344680786, "learning_rate": 4.9623337791401535e-05, "loss": 0.1105, "num_input_tokens_seen": 1099616, "step": 5765 }, { "epoch": 2.998960498960499, "grad_norm": 0.5376678705215454, "learning_rate": 4.962137433216854e-05, "loss": 0.1305, "num_input_tokens_seen": 1100512, "step": 5770 }, { "epoch": 3.0, "eval_loss": 0.1597965657711029, "eval_runtime": 8.0161, "eval_samples_per_second": 106.785, "eval_steps_per_second": 26.696, "num_input_tokens_seen": 1100824, "step": 5772 }, { "epoch": 3.0015592515592515, "grad_norm": 0.26059064269065857, "learning_rate": 4.9619405807732125e-05, "loss": 0.2038, "num_input_tokens_seen": 1101368, "step": 5775 }, { "epoch": 3.004158004158004, "grad_norm": 0.21463404595851898, "learning_rate": 4.961743221849724e-05, "loss": 0.1207, "num_input_tokens_seen": 1102296, "step": 5780 }, { "epoch": 3.0067567567567566, "grad_norm": 0.5925827026367188, "learning_rate": 4.9615453564869926e-05, "loss": 0.1196, "num_input_tokens_seen": 1103224, "step": 5785 }, { "epoch": 3.0093555093555096, "grad_norm": 0.9120975732803345, "learning_rate": 4.961346984725722e-05, "loss": 0.2317, "num_input_tokens_seen": 1104152, "step": 5790 }, { "epoch": 3.011954261954262, "grad_norm": 0.3537411391735077, "learning_rate": 4.9611481066067234e-05, "loss": 0.1748, "num_input_tokens_seen": 1105080, "step": 5795 }, { "epoch": 3.0145530145530146, "grad_norm": 0.8014914989471436, "learning_rate": 4.96094872217091e-05, "loss": 0.2395, "num_input_tokens_seen": 1106040, "step": 5800 }, { "epoch": 3.017151767151767, "grad_norm": 0.7314131259918213, "learning_rate": 4.9607488314593006e-05, "loss": 0.144, "num_input_tokens_seen": 1106936, "step": 5805 }, { "epoch": 3.0197505197505197, "grad_norm": 0.7925615310668945, "learning_rate": 4.960548434513016e-05, "loss": 0.1186, "num_input_tokens_seen": 1107864, "step": 5810 }, { "epoch": 3.022349272349272, "grad_norm": 0.3968326449394226, "learning_rate": 4.9603475313732844e-05, "loss": 0.1954, "num_input_tokens_seen": 1108824, "step": 5815 }, { "epoch": 3.024948024948025, "grad_norm": 0.06574611365795135, "learning_rate": 4.960146122081436e-05, "loss": 0.0472, "num_input_tokens_seen": 1109720, "step": 5820 }, { "epoch": 3.0275467775467777, "grad_norm": 0.2570836544036865, "learning_rate": 4.9599442066789035e-05, "loss": 0.1089, "num_input_tokens_seen": 1110648, "step": 5825 }, { "epoch": 3.0301455301455302, "grad_norm": 0.28559452295303345, "learning_rate": 4.9597417852072285e-05, "loss": 0.138, "num_input_tokens_seen": 1111544, "step": 5830 }, { "epoch": 3.0327442827442828, "grad_norm": 0.8061383962631226, "learning_rate": 4.959538857708053e-05, "loss": 0.1429, "num_input_tokens_seen": 1112536, "step": 5835 }, { "epoch": 3.0353430353430353, "grad_norm": 0.5269327759742737, "learning_rate": 4.959335424223123e-05, "loss": 0.1699, "num_input_tokens_seen": 1113464, "step": 5840 }, { "epoch": 3.037941787941788, "grad_norm": 0.6623901724815369, "learning_rate": 4.9591314847942904e-05, "loss": 0.1441, "num_input_tokens_seen": 1114360, "step": 5845 }, { "epoch": 3.0405405405405403, "grad_norm": 1.2379485368728638, "learning_rate": 4.9589270394635104e-05, "loss": 0.1484, "num_input_tokens_seen": 1115352, "step": 5850 }, { "epoch": 3.0431392931392933, "grad_norm": 1.3337010145187378, "learning_rate": 4.958722088272843e-05, "loss": 0.1958, "num_input_tokens_seen": 1116376, "step": 5855 }, { "epoch": 3.045738045738046, "grad_norm": 0.03862304985523224, "learning_rate": 4.958516631264449e-05, "loss": 0.143, "num_input_tokens_seen": 1117304, "step": 5860 }, { "epoch": 3.0483367983367984, "grad_norm": 0.8547875881195068, "learning_rate": 4.958310668480599e-05, "loss": 0.2293, "num_input_tokens_seen": 1118200, "step": 5865 }, { "epoch": 3.050935550935551, "grad_norm": 0.20818234980106354, "learning_rate": 4.958104199963663e-05, "loss": 0.1402, "num_input_tokens_seen": 1119160, "step": 5870 }, { "epoch": 3.0535343035343034, "grad_norm": 1.4203813076019287, "learning_rate": 4.957897225756116e-05, "loss": 0.2083, "num_input_tokens_seen": 1120056, "step": 5875 }, { "epoch": 3.056133056133056, "grad_norm": 0.15914605557918549, "learning_rate": 4.957689745900539e-05, "loss": 0.1043, "num_input_tokens_seen": 1121016, "step": 5880 }, { "epoch": 3.058731808731809, "grad_norm": 0.06624385714530945, "learning_rate": 4.957481760439613e-05, "loss": 0.1457, "num_input_tokens_seen": 1122008, "step": 5885 }, { "epoch": 3.0613305613305615, "grad_norm": 0.5681526064872742, "learning_rate": 4.957273269416128e-05, "loss": 0.269, "num_input_tokens_seen": 1122968, "step": 5890 }, { "epoch": 3.063929313929314, "grad_norm": 1.1037936210632324, "learning_rate": 4.957064272872975e-05, "loss": 0.2023, "num_input_tokens_seen": 1123960, "step": 5895 }, { "epoch": 3.0665280665280665, "grad_norm": 0.19638842344284058, "learning_rate": 4.9568547708531487e-05, "loss": 0.0877, "num_input_tokens_seen": 1124920, "step": 5900 }, { "epoch": 3.069126819126819, "grad_norm": 0.5266339182853699, "learning_rate": 4.9566447633997495e-05, "loss": 0.1416, "num_input_tokens_seen": 1125912, "step": 5905 }, { "epoch": 3.0717255717255716, "grad_norm": 0.3351559638977051, "learning_rate": 4.956434250555981e-05, "loss": 0.218, "num_input_tokens_seen": 1126872, "step": 5910 }, { "epoch": 3.074324324324324, "grad_norm": 0.21729294955730438, "learning_rate": 4.9562232323651505e-05, "loss": 0.1676, "num_input_tokens_seen": 1127832, "step": 5915 }, { "epoch": 3.076923076923077, "grad_norm": 0.6092172861099243, "learning_rate": 4.956011708870669e-05, "loss": 0.3318, "num_input_tokens_seen": 1128728, "step": 5920 }, { "epoch": 3.0795218295218296, "grad_norm": 0.26267707347869873, "learning_rate": 4.955799680116052e-05, "loss": 0.1044, "num_input_tokens_seen": 1129720, "step": 5925 }, { "epoch": 3.082120582120582, "grad_norm": 1.6035045385360718, "learning_rate": 4.95558714614492e-05, "loss": 0.2538, "num_input_tokens_seen": 1130744, "step": 5930 }, { "epoch": 3.0847193347193347, "grad_norm": 0.5426732897758484, "learning_rate": 4.955374107000995e-05, "loss": 0.1656, "num_input_tokens_seen": 1131704, "step": 5935 }, { "epoch": 3.087318087318087, "grad_norm": 0.8580184578895569, "learning_rate": 4.955160562728105e-05, "loss": 0.1615, "num_input_tokens_seen": 1132664, "step": 5940 }, { "epoch": 3.0899168399168397, "grad_norm": 0.6278625130653381, "learning_rate": 4.95494651337018e-05, "loss": 0.2665, "num_input_tokens_seen": 1133560, "step": 5945 }, { "epoch": 3.0925155925155927, "grad_norm": 0.3576502501964569, "learning_rate": 4.954731958971257e-05, "loss": 0.1018, "num_input_tokens_seen": 1134488, "step": 5950 }, { "epoch": 3.0951143451143452, "grad_norm": 0.15389719605445862, "learning_rate": 4.954516899575473e-05, "loss": 0.2056, "num_input_tokens_seen": 1135384, "step": 5955 }, { "epoch": 3.0977130977130978, "grad_norm": 0.9875360131263733, "learning_rate": 4.9543013352270715e-05, "loss": 0.1209, "num_input_tokens_seen": 1136280, "step": 5960 }, { "epoch": 3.1003118503118503, "grad_norm": 0.36524856090545654, "learning_rate": 4.954085265970401e-05, "loss": 0.1725, "num_input_tokens_seen": 1137208, "step": 5965 }, { "epoch": 3.102910602910603, "grad_norm": 0.7527374029159546, "learning_rate": 4.953868691849909e-05, "loss": 0.1191, "num_input_tokens_seen": 1138104, "step": 5970 }, { "epoch": 3.1055093555093554, "grad_norm": 0.5291557312011719, "learning_rate": 4.9536516129101516e-05, "loss": 0.0983, "num_input_tokens_seen": 1139032, "step": 5975 }, { "epoch": 3.108108108108108, "grad_norm": 0.5400021076202393, "learning_rate": 4.953434029195788e-05, "loss": 0.188, "num_input_tokens_seen": 1139992, "step": 5980 }, { "epoch": 3.110706860706861, "grad_norm": 0.37639477849006653, "learning_rate": 4.9532159407515785e-05, "loss": 0.0938, "num_input_tokens_seen": 1140856, "step": 5985 }, { "epoch": 3.1133056133056134, "grad_norm": 0.9694490432739258, "learning_rate": 4.952997347622389e-05, "loss": 0.1814, "num_input_tokens_seen": 1141784, "step": 5990 }, { "epoch": 3.115904365904366, "grad_norm": 0.17461927235126495, "learning_rate": 4.9527782498531915e-05, "loss": 0.1409, "num_input_tokens_seen": 1142712, "step": 5995 }, { "epoch": 3.1185031185031185, "grad_norm": 0.2567225694656372, "learning_rate": 4.952558647489058e-05, "loss": 0.2441, "num_input_tokens_seen": 1143704, "step": 6000 }, { "epoch": 3.121101871101871, "grad_norm": 0.7505885362625122, "learning_rate": 4.952338540575167e-05, "loss": 0.1164, "num_input_tokens_seen": 1144600, "step": 6005 }, { "epoch": 3.1237006237006235, "grad_norm": 0.7190178036689758, "learning_rate": 4.952117929156799e-05, "loss": 0.1172, "num_input_tokens_seen": 1145496, "step": 6010 }, { "epoch": 3.1262993762993765, "grad_norm": 1.0545862913131714, "learning_rate": 4.951896813279338e-05, "loss": 0.1813, "num_input_tokens_seen": 1146488, "step": 6015 }, { "epoch": 3.128898128898129, "grad_norm": 0.8656798601150513, "learning_rate": 4.9516751929882743e-05, "loss": 0.1212, "num_input_tokens_seen": 1147512, "step": 6020 }, { "epoch": 3.1314968814968815, "grad_norm": 0.3883703649044037, "learning_rate": 4.9514530683292e-05, "loss": 0.1151, "num_input_tokens_seen": 1148504, "step": 6025 }, { "epoch": 3.134095634095634, "grad_norm": 0.48660388588905334, "learning_rate": 4.9512304393478125e-05, "loss": 0.112, "num_input_tokens_seen": 1149432, "step": 6030 }, { "epoch": 3.1366943866943866, "grad_norm": 0.18311914801597595, "learning_rate": 4.95100730608991e-05, "loss": 0.0924, "num_input_tokens_seen": 1150424, "step": 6035 }, { "epoch": 3.139293139293139, "grad_norm": 0.6854307651519775, "learning_rate": 4.9507836686013975e-05, "loss": 0.1166, "num_input_tokens_seen": 1151384, "step": 6040 }, { "epoch": 3.141891891891892, "grad_norm": 0.04061078652739525, "learning_rate": 4.9505595269282815e-05, "loss": 0.1477, "num_input_tokens_seen": 1152344, "step": 6045 }, { "epoch": 3.1444906444906446, "grad_norm": 0.24716849625110626, "learning_rate": 4.950334881116675e-05, "loss": 0.1293, "num_input_tokens_seen": 1153272, "step": 6050 }, { "epoch": 3.147089397089397, "grad_norm": 0.4614942669868469, "learning_rate": 4.950109731212792e-05, "loss": 0.1164, "num_input_tokens_seen": 1154168, "step": 6055 }, { "epoch": 3.1496881496881497, "grad_norm": 0.7978526949882507, "learning_rate": 4.949884077262951e-05, "loss": 0.1273, "num_input_tokens_seen": 1155160, "step": 6060 }, { "epoch": 3.1522869022869022, "grad_norm": 0.801846981048584, "learning_rate": 4.949657919313574e-05, "loss": 0.0755, "num_input_tokens_seen": 1156024, "step": 6065 }, { "epoch": 3.1548856548856548, "grad_norm": 0.6174232959747314, "learning_rate": 4.949431257411188e-05, "loss": 0.1939, "num_input_tokens_seen": 1157016, "step": 6070 }, { "epoch": 3.1574844074844073, "grad_norm": 0.43086865544319153, "learning_rate": 4.949204091602423e-05, "loss": 0.0848, "num_input_tokens_seen": 1157912, "step": 6075 }, { "epoch": 3.1600831600831603, "grad_norm": 1.4609079360961914, "learning_rate": 4.948976421934012e-05, "loss": 0.1659, "num_input_tokens_seen": 1158840, "step": 6080 }, { "epoch": 3.162681912681913, "grad_norm": 1.6339364051818848, "learning_rate": 4.948748248452791e-05, "loss": 0.2283, "num_input_tokens_seen": 1159768, "step": 6085 }, { "epoch": 3.1652806652806653, "grad_norm": 0.8549896478652954, "learning_rate": 4.948519571205703e-05, "loss": 0.2957, "num_input_tokens_seen": 1160728, "step": 6090 }, { "epoch": 3.167879417879418, "grad_norm": 0.18246260285377502, "learning_rate": 4.94829039023979e-05, "loss": 0.074, "num_input_tokens_seen": 1161688, "step": 6095 }, { "epoch": 3.1704781704781704, "grad_norm": 0.4972393214702606, "learning_rate": 4.9480607056022015e-05, "loss": 0.1375, "num_input_tokens_seen": 1162616, "step": 6100 }, { "epoch": 3.173076923076923, "grad_norm": 0.21307413280010223, "learning_rate": 4.9478305173401885e-05, "loss": 0.114, "num_input_tokens_seen": 1163576, "step": 6105 }, { "epoch": 3.175675675675676, "grad_norm": 0.5902394652366638, "learning_rate": 4.9475998255011066e-05, "loss": 0.1358, "num_input_tokens_seen": 1164568, "step": 6110 }, { "epoch": 3.1782744282744284, "grad_norm": 0.37314242124557495, "learning_rate": 4.947368630132414e-05, "loss": 0.2593, "num_input_tokens_seen": 1165496, "step": 6115 }, { "epoch": 3.180873180873181, "grad_norm": 0.42918404936790466, "learning_rate": 4.947136931281674e-05, "loss": 0.2391, "num_input_tokens_seen": 1166424, "step": 6120 }, { "epoch": 3.1834719334719335, "grad_norm": 0.2191557139158249, "learning_rate": 4.946904728996551e-05, "loss": 0.1866, "num_input_tokens_seen": 1167416, "step": 6125 }, { "epoch": 3.186070686070686, "grad_norm": 0.38995131850242615, "learning_rate": 4.946672023324817e-05, "loss": 0.1334, "num_input_tokens_seen": 1168376, "step": 6130 }, { "epoch": 3.1886694386694385, "grad_norm": 0.39514490962028503, "learning_rate": 4.9464388143143426e-05, "loss": 0.2204, "num_input_tokens_seen": 1169368, "step": 6135 }, { "epoch": 3.1912681912681915, "grad_norm": 0.49470776319503784, "learning_rate": 4.946205102013106e-05, "loss": 0.0739, "num_input_tokens_seen": 1170264, "step": 6140 }, { "epoch": 3.193866943866944, "grad_norm": 0.19962459802627563, "learning_rate": 4.945970886469187e-05, "loss": 0.1261, "num_input_tokens_seen": 1171160, "step": 6145 }, { "epoch": 3.1964656964656966, "grad_norm": 0.3709084391593933, "learning_rate": 4.945736167730769e-05, "loss": 0.1491, "num_input_tokens_seen": 1172152, "step": 6150 }, { "epoch": 3.199064449064449, "grad_norm": 0.4247031807899475, "learning_rate": 4.945500945846139e-05, "loss": 0.186, "num_input_tokens_seen": 1173144, "step": 6155 }, { "epoch": 3.2016632016632016, "grad_norm": 1.2666043043136597, "learning_rate": 4.945265220863689e-05, "loss": 0.2, "num_input_tokens_seen": 1174136, "step": 6160 }, { "epoch": 3.204261954261954, "grad_norm": 1.1056571006774902, "learning_rate": 4.945028992831913e-05, "loss": 0.1729, "num_input_tokens_seen": 1175064, "step": 6165 }, { "epoch": 3.2068607068607067, "grad_norm": 1.3882415294647217, "learning_rate": 4.944792261799407e-05, "loss": 0.3582, "num_input_tokens_seen": 1176024, "step": 6170 }, { "epoch": 3.2094594594594597, "grad_norm": 1.4335811138153076, "learning_rate": 4.9445550278148736e-05, "loss": 0.2163, "num_input_tokens_seen": 1177048, "step": 6175 }, { "epoch": 3.212058212058212, "grad_norm": 1.297467827796936, "learning_rate": 4.944317290927117e-05, "loss": 0.1352, "num_input_tokens_seen": 1178072, "step": 6180 }, { "epoch": 3.2146569646569647, "grad_norm": 0.9996982216835022, "learning_rate": 4.9440790511850465e-05, "loss": 0.1, "num_input_tokens_seen": 1178968, "step": 6185 }, { "epoch": 3.2172557172557172, "grad_norm": 0.4682959318161011, "learning_rate": 4.9438403086376726e-05, "loss": 0.1133, "num_input_tokens_seen": 1179864, "step": 6190 }, { "epoch": 3.2198544698544698, "grad_norm": 0.22890494763851166, "learning_rate": 4.94360106333411e-05, "loss": 0.2161, "num_input_tokens_seen": 1180824, "step": 6195 }, { "epoch": 3.2224532224532223, "grad_norm": 0.6866891980171204, "learning_rate": 4.943361315323579e-05, "loss": 0.1147, "num_input_tokens_seen": 1181784, "step": 6200 }, { "epoch": 3.225051975051975, "grad_norm": 0.47341522574424744, "learning_rate": 4.9431210646553994e-05, "loss": 0.1176, "num_input_tokens_seen": 1182712, "step": 6205 }, { "epoch": 3.227650727650728, "grad_norm": 0.7243704199790955, "learning_rate": 4.942880311378997e-05, "loss": 0.1848, "num_input_tokens_seen": 1183736, "step": 6210 }, { "epoch": 3.2302494802494803, "grad_norm": 0.32056984305381775, "learning_rate": 4.942639055543902e-05, "loss": 0.1013, "num_input_tokens_seen": 1184600, "step": 6215 }, { "epoch": 3.232848232848233, "grad_norm": 0.5367963314056396, "learning_rate": 4.9423972971997436e-05, "loss": 0.1501, "num_input_tokens_seen": 1185528, "step": 6220 }, { "epoch": 3.2354469854469854, "grad_norm": 0.2393541932106018, "learning_rate": 4.94215503639626e-05, "loss": 0.0999, "num_input_tokens_seen": 1186392, "step": 6225 }, { "epoch": 3.238045738045738, "grad_norm": 1.0694290399551392, "learning_rate": 4.9419122731832885e-05, "loss": 0.1877, "num_input_tokens_seen": 1187352, "step": 6230 }, { "epoch": 3.2406444906444904, "grad_norm": 0.6855499744415283, "learning_rate": 4.9416690076107716e-05, "loss": 0.125, "num_input_tokens_seen": 1188280, "step": 6235 }, { "epoch": 3.2432432432432434, "grad_norm": 0.24914507567882538, "learning_rate": 4.941425239728755e-05, "loss": 0.1756, "num_input_tokens_seen": 1189240, "step": 6240 }, { "epoch": 3.245841995841996, "grad_norm": 1.0616021156311035, "learning_rate": 4.941180969587388e-05, "loss": 0.0758, "num_input_tokens_seen": 1190200, "step": 6245 }, { "epoch": 3.2484407484407485, "grad_norm": 0.6526685953140259, "learning_rate": 4.940936197236922e-05, "loss": 0.1303, "num_input_tokens_seen": 1191160, "step": 6250 }, { "epoch": 3.251039501039501, "grad_norm": 0.7192609310150146, "learning_rate": 4.9406909227277135e-05, "loss": 0.2772, "num_input_tokens_seen": 1192184, "step": 6255 }, { "epoch": 3.2536382536382535, "grad_norm": 0.2266523540019989, "learning_rate": 4.940445146110219e-05, "loss": 0.0494, "num_input_tokens_seen": 1193112, "step": 6260 }, { "epoch": 3.256237006237006, "grad_norm": 0.4249137043952942, "learning_rate": 4.940198867435003e-05, "loss": 0.1696, "num_input_tokens_seen": 1194072, "step": 6265 }, { "epoch": 3.258835758835759, "grad_norm": 0.7420647144317627, "learning_rate": 4.9399520867527304e-05, "loss": 0.1019, "num_input_tokens_seen": 1195064, "step": 6270 }, { "epoch": 3.2614345114345116, "grad_norm": 0.46989479660987854, "learning_rate": 4.939704804114169e-05, "loss": 0.128, "num_input_tokens_seen": 1195992, "step": 6275 }, { "epoch": 3.264033264033264, "grad_norm": 1.6731925010681152, "learning_rate": 4.9394570195701915e-05, "loss": 0.2791, "num_input_tokens_seen": 1196952, "step": 6280 }, { "epoch": 3.2666320166320166, "grad_norm": 0.43598926067352295, "learning_rate": 4.939208733171774e-05, "loss": 0.2213, "num_input_tokens_seen": 1197944, "step": 6285 }, { "epoch": 3.269230769230769, "grad_norm": 0.17714762687683105, "learning_rate": 4.9389599449699925e-05, "loss": 0.1063, "num_input_tokens_seen": 1198936, "step": 6290 }, { "epoch": 3.2718295218295217, "grad_norm": 0.6918197274208069, "learning_rate": 4.93871065501603e-05, "loss": 0.3487, "num_input_tokens_seen": 1199896, "step": 6295 }, { "epoch": 3.274428274428274, "grad_norm": 1.3388686180114746, "learning_rate": 4.938460863361172e-05, "loss": 0.2004, "num_input_tokens_seen": 1200824, "step": 6300 }, { "epoch": 3.277027027027027, "grad_norm": 0.7172056436538696, "learning_rate": 4.938210570056806e-05, "loss": 0.1884, "num_input_tokens_seen": 1201784, "step": 6305 }, { "epoch": 3.2796257796257797, "grad_norm": 0.8301631212234497, "learning_rate": 4.937959775154423e-05, "loss": 0.2934, "num_input_tokens_seen": 1202712, "step": 6310 }, { "epoch": 3.2822245322245323, "grad_norm": 0.49005475640296936, "learning_rate": 4.937708478705617e-05, "loss": 0.1501, "num_input_tokens_seen": 1203736, "step": 6315 }, { "epoch": 3.284823284823285, "grad_norm": 1.4105284214019775, "learning_rate": 4.937456680762087e-05, "loss": 0.2152, "num_input_tokens_seen": 1204664, "step": 6320 }, { "epoch": 3.2874220374220373, "grad_norm": 0.3673138916492462, "learning_rate": 4.9372043813756335e-05, "loss": 0.1136, "num_input_tokens_seen": 1205528, "step": 6325 }, { "epoch": 3.29002079002079, "grad_norm": 0.9468586444854736, "learning_rate": 4.93695158059816e-05, "loss": 0.1792, "num_input_tokens_seen": 1206520, "step": 6330 }, { "epoch": 3.2926195426195424, "grad_norm": 0.40587490797042847, "learning_rate": 4.936698278481674e-05, "loss": 0.1387, "num_input_tokens_seen": 1207512, "step": 6335 }, { "epoch": 3.2952182952182953, "grad_norm": 0.5626506209373474, "learning_rate": 4.936444475078286e-05, "loss": 0.1692, "num_input_tokens_seen": 1208504, "step": 6340 }, { "epoch": 3.297817047817048, "grad_norm": 0.13598257303237915, "learning_rate": 4.936190170440208e-05, "loss": 0.0966, "num_input_tokens_seen": 1209400, "step": 6345 }, { "epoch": 3.3004158004158004, "grad_norm": 0.1940138190984726, "learning_rate": 4.9359353646197585e-05, "loss": 0.1378, "num_input_tokens_seen": 1210360, "step": 6350 }, { "epoch": 3.303014553014553, "grad_norm": 0.9220267534255981, "learning_rate": 4.935680057669356e-05, "loss": 0.2339, "num_input_tokens_seen": 1211320, "step": 6355 }, { "epoch": 3.3056133056133055, "grad_norm": 1.2625051736831665, "learning_rate": 4.935424249641524e-05, "loss": 0.236, "num_input_tokens_seen": 1212216, "step": 6360 }, { "epoch": 3.3082120582120584, "grad_norm": 0.7640487551689148, "learning_rate": 4.935167940588887e-05, "loss": 0.1525, "num_input_tokens_seen": 1213208, "step": 6365 }, { "epoch": 3.310810810810811, "grad_norm": 0.4578562080860138, "learning_rate": 4.934911130564175e-05, "loss": 0.1339, "num_input_tokens_seen": 1214200, "step": 6370 }, { "epoch": 3.3134095634095635, "grad_norm": 0.21945306658744812, "learning_rate": 4.934653819620219e-05, "loss": 0.1609, "num_input_tokens_seen": 1215096, "step": 6375 }, { "epoch": 3.316008316008316, "grad_norm": 1.0328587293624878, "learning_rate": 4.934396007809955e-05, "loss": 0.1031, "num_input_tokens_seen": 1216056, "step": 6380 }, { "epoch": 3.3186070686070686, "grad_norm": 0.347613662481308, "learning_rate": 4.934137695186421e-05, "loss": 0.1026, "num_input_tokens_seen": 1217016, "step": 6385 }, { "epoch": 3.321205821205821, "grad_norm": 0.5630604028701782, "learning_rate": 4.933878881802757e-05, "loss": 0.181, "num_input_tokens_seen": 1217944, "step": 6390 }, { "epoch": 3.3238045738045736, "grad_norm": 0.5856571197509766, "learning_rate": 4.9336195677122085e-05, "loss": 0.1807, "num_input_tokens_seen": 1218872, "step": 6395 }, { "epoch": 3.3264033264033266, "grad_norm": 0.5277001857757568, "learning_rate": 4.933359752968121e-05, "loss": 0.1484, "num_input_tokens_seen": 1219800, "step": 6400 }, { "epoch": 3.329002079002079, "grad_norm": 0.7695392966270447, "learning_rate": 4.933099437623946e-05, "loss": 0.1157, "num_input_tokens_seen": 1220760, "step": 6405 }, { "epoch": 3.3316008316008316, "grad_norm": 0.35270535945892334, "learning_rate": 4.932838621733236e-05, "loss": 0.1464, "num_input_tokens_seen": 1221784, "step": 6410 }, { "epoch": 3.334199584199584, "grad_norm": 1.5485786199569702, "learning_rate": 4.932577305349647e-05, "loss": 0.1993, "num_input_tokens_seen": 1222872, "step": 6415 }, { "epoch": 3.3367983367983367, "grad_norm": 1.081960678100586, "learning_rate": 4.932315488526937e-05, "loss": 0.0866, "num_input_tokens_seen": 1223800, "step": 6420 }, { "epoch": 3.3393970893970892, "grad_norm": 0.14219677448272705, "learning_rate": 4.932053171318971e-05, "loss": 0.1896, "num_input_tokens_seen": 1224760, "step": 6425 }, { "epoch": 3.3419958419958418, "grad_norm": 0.9224475622177124, "learning_rate": 4.931790353779711e-05, "loss": 0.3789, "num_input_tokens_seen": 1225880, "step": 6430 }, { "epoch": 3.3445945945945947, "grad_norm": 0.8629010915756226, "learning_rate": 4.931527035963225e-05, "loss": 0.1609, "num_input_tokens_seen": 1226808, "step": 6435 }, { "epoch": 3.3471933471933473, "grad_norm": 0.800975501537323, "learning_rate": 4.931263217923685e-05, "loss": 0.1127, "num_input_tokens_seen": 1227736, "step": 6440 }, { "epoch": 3.3497920997921, "grad_norm": 0.291362464427948, "learning_rate": 4.930998899715364e-05, "loss": 0.122, "num_input_tokens_seen": 1228696, "step": 6445 }, { "epoch": 3.3523908523908523, "grad_norm": 0.4694916605949402, "learning_rate": 4.930734081392639e-05, "loss": 0.1197, "num_input_tokens_seen": 1229592, "step": 6450 }, { "epoch": 3.354989604989605, "grad_norm": 0.4939940869808197, "learning_rate": 4.930468763009989e-05, "loss": 0.2039, "num_input_tokens_seen": 1230488, "step": 6455 }, { "epoch": 3.357588357588358, "grad_norm": 0.6297093629837036, "learning_rate": 4.930202944621997e-05, "loss": 0.261, "num_input_tokens_seen": 1231512, "step": 6460 }, { "epoch": 3.3601871101871104, "grad_norm": 0.15312384068965912, "learning_rate": 4.929936626283347e-05, "loss": 0.1016, "num_input_tokens_seen": 1232376, "step": 6465 }, { "epoch": 3.362785862785863, "grad_norm": 0.7337888479232788, "learning_rate": 4.929669808048827e-05, "loss": 0.1989, "num_input_tokens_seen": 1233304, "step": 6470 }, { "epoch": 3.3653846153846154, "grad_norm": 0.5358867049217224, "learning_rate": 4.9294024899733296e-05, "loss": 0.1088, "num_input_tokens_seen": 1234232, "step": 6475 }, { "epoch": 3.367983367983368, "grad_norm": 0.29464593529701233, "learning_rate": 4.929134672111847e-05, "loss": 0.1777, "num_input_tokens_seen": 1235096, "step": 6480 }, { "epoch": 3.3705821205821205, "grad_norm": 0.31042787432670593, "learning_rate": 4.928866354519476e-05, "loss": 0.1286, "num_input_tokens_seen": 1236088, "step": 6485 }, { "epoch": 3.373180873180873, "grad_norm": 0.34950894117355347, "learning_rate": 4.928597537251417e-05, "loss": 0.124, "num_input_tokens_seen": 1237144, "step": 6490 }, { "epoch": 3.375779625779626, "grad_norm": 0.5486361384391785, "learning_rate": 4.92832822036297e-05, "loss": 0.1189, "num_input_tokens_seen": 1238168, "step": 6495 }, { "epoch": 3.3783783783783785, "grad_norm": 1.1343339681625366, "learning_rate": 4.928058403909541e-05, "loss": 0.1534, "num_input_tokens_seen": 1239064, "step": 6500 }, { "epoch": 3.380977130977131, "grad_norm": 0.6549786329269409, "learning_rate": 4.9277880879466384e-05, "loss": 0.1125, "num_input_tokens_seen": 1240056, "step": 6505 }, { "epoch": 3.3835758835758836, "grad_norm": 0.12024536728858948, "learning_rate": 4.927517272529872e-05, "loss": 0.1411, "num_input_tokens_seen": 1240984, "step": 6510 }, { "epoch": 3.386174636174636, "grad_norm": 0.17795398831367493, "learning_rate": 4.9272459577149555e-05, "loss": 0.1052, "num_input_tokens_seen": 1241944, "step": 6515 }, { "epoch": 3.3887733887733886, "grad_norm": 0.4714156687259674, "learning_rate": 4.926974143557703e-05, "loss": 0.2192, "num_input_tokens_seen": 1242968, "step": 6520 }, { "epoch": 3.391372141372141, "grad_norm": 0.35422244668006897, "learning_rate": 4.9267018301140356e-05, "loss": 0.1557, "num_input_tokens_seen": 1243832, "step": 6525 }, { "epoch": 3.393970893970894, "grad_norm": 0.4128485321998596, "learning_rate": 4.926429017439973e-05, "loss": 0.1383, "num_input_tokens_seen": 1244856, "step": 6530 }, { "epoch": 3.3965696465696467, "grad_norm": 0.5128244161605835, "learning_rate": 4.92615570559164e-05, "loss": 0.2824, "num_input_tokens_seen": 1245848, "step": 6535 }, { "epoch": 3.399168399168399, "grad_norm": 1.8285105228424072, "learning_rate": 4.9258818946252624e-05, "loss": 0.247, "num_input_tokens_seen": 1246776, "step": 6540 }, { "epoch": 3.4017671517671517, "grad_norm": 0.453112930059433, "learning_rate": 4.925607584597172e-05, "loss": 0.0882, "num_input_tokens_seen": 1247736, "step": 6545 }, { "epoch": 3.4043659043659042, "grad_norm": 0.7031856179237366, "learning_rate": 4.9253327755637976e-05, "loss": 0.2624, "num_input_tokens_seen": 1248728, "step": 6550 }, { "epoch": 3.406964656964657, "grad_norm": 0.7624748349189758, "learning_rate": 4.9250574675816765e-05, "loss": 0.1974, "num_input_tokens_seen": 1249720, "step": 6555 }, { "epoch": 3.4095634095634098, "grad_norm": 1.5161346197128296, "learning_rate": 4.9247816607074446e-05, "loss": 0.1373, "num_input_tokens_seen": 1250616, "step": 6560 }, { "epoch": 3.4121621621621623, "grad_norm": 0.521281898021698, "learning_rate": 4.924505354997844e-05, "loss": 0.0973, "num_input_tokens_seen": 1251608, "step": 6565 }, { "epoch": 3.414760914760915, "grad_norm": 0.14696148037910461, "learning_rate": 4.9242285505097144e-05, "loss": 0.1511, "num_input_tokens_seen": 1252568, "step": 6570 }, { "epoch": 3.4173596673596673, "grad_norm": 1.0696899890899658, "learning_rate": 4.923951247300004e-05, "loss": 0.1665, "num_input_tokens_seen": 1253560, "step": 6575 }, { "epoch": 3.41995841995842, "grad_norm": 0.6246198415756226, "learning_rate": 4.9236734454257585e-05, "loss": 0.2081, "num_input_tokens_seen": 1254584, "step": 6580 }, { "epoch": 3.4225571725571724, "grad_norm": 0.8521220088005066, "learning_rate": 4.9233951449441295e-05, "loss": 0.0784, "num_input_tokens_seen": 1255512, "step": 6585 }, { "epoch": 3.4251559251559254, "grad_norm": 0.8537954092025757, "learning_rate": 4.9231163459123695e-05, "loss": 0.1862, "num_input_tokens_seen": 1256440, "step": 6590 }, { "epoch": 3.427754677754678, "grad_norm": 1.2032396793365479, "learning_rate": 4.922837048387834e-05, "loss": 0.199, "num_input_tokens_seen": 1257368, "step": 6595 }, { "epoch": 3.4303534303534304, "grad_norm": 0.22988316416740417, "learning_rate": 4.922557252427983e-05, "loss": 0.1006, "num_input_tokens_seen": 1258264, "step": 6600 }, { "epoch": 3.432952182952183, "grad_norm": 0.19698287546634674, "learning_rate": 4.922276958090374e-05, "loss": 0.0926, "num_input_tokens_seen": 1259160, "step": 6605 }, { "epoch": 3.4355509355509355, "grad_norm": 0.7510048151016235, "learning_rate": 4.921996165432673e-05, "loss": 0.1342, "num_input_tokens_seen": 1260184, "step": 6610 }, { "epoch": 3.438149688149688, "grad_norm": 1.0521183013916016, "learning_rate": 4.921714874512644e-05, "loss": 0.1033, "num_input_tokens_seen": 1261208, "step": 6615 }, { "epoch": 3.4407484407484406, "grad_norm": 0.3500896990299225, "learning_rate": 4.9214330853881564e-05, "loss": 0.0927, "num_input_tokens_seen": 1262168, "step": 6620 }, { "epoch": 3.4433471933471935, "grad_norm": 1.4176148176193237, "learning_rate": 4.92115079811718e-05, "loss": 0.234, "num_input_tokens_seen": 1263160, "step": 6625 }, { "epoch": 3.445945945945946, "grad_norm": 0.851578950881958, "learning_rate": 4.920868012757789e-05, "loss": 0.1662, "num_input_tokens_seen": 1264152, "step": 6630 }, { "epoch": 3.4485446985446986, "grad_norm": 1.4466139078140259, "learning_rate": 4.920584729368159e-05, "loss": 0.1487, "num_input_tokens_seen": 1265080, "step": 6635 }, { "epoch": 3.451143451143451, "grad_norm": 1.1982238292694092, "learning_rate": 4.920300948006567e-05, "loss": 0.2432, "num_input_tokens_seen": 1266040, "step": 6640 }, { "epoch": 3.4537422037422036, "grad_norm": 0.38094034790992737, "learning_rate": 4.9200166687313945e-05, "loss": 0.1474, "num_input_tokens_seen": 1267032, "step": 6645 }, { "epoch": 3.456340956340956, "grad_norm": 0.8720236420631409, "learning_rate": 4.919731891601124e-05, "loss": 0.1677, "num_input_tokens_seen": 1267992, "step": 6650 }, { "epoch": 3.4589397089397087, "grad_norm": 0.414876788854599, "learning_rate": 4.9194466166743415e-05, "loss": 0.1158, "num_input_tokens_seen": 1268920, "step": 6655 }, { "epoch": 3.4615384615384617, "grad_norm": 0.43789157271385193, "learning_rate": 4.919160844009734e-05, "loss": 0.1584, "num_input_tokens_seen": 1269816, "step": 6660 }, { "epoch": 3.464137214137214, "grad_norm": 1.0744622945785522, "learning_rate": 4.918874573666094e-05, "loss": 0.2659, "num_input_tokens_seen": 1270776, "step": 6665 }, { "epoch": 3.4667359667359667, "grad_norm": 0.49198389053344727, "learning_rate": 4.9185878057023104e-05, "loss": 0.1134, "num_input_tokens_seen": 1271736, "step": 6670 }, { "epoch": 3.4693347193347193, "grad_norm": 0.3584353029727936, "learning_rate": 4.9183005401773816e-05, "loss": 0.1077, "num_input_tokens_seen": 1272696, "step": 6675 }, { "epoch": 3.471933471933472, "grad_norm": 0.5633264183998108, "learning_rate": 4.918012777150403e-05, "loss": 0.1423, "num_input_tokens_seen": 1273592, "step": 6680 }, { "epoch": 3.4745322245322248, "grad_norm": 0.870004415512085, "learning_rate": 4.9177245166805755e-05, "loss": 0.122, "num_input_tokens_seen": 1274584, "step": 6685 }, { "epoch": 3.4771309771309773, "grad_norm": 0.39215362071990967, "learning_rate": 4.9174357588272e-05, "loss": 0.0859, "num_input_tokens_seen": 1275544, "step": 6690 }, { "epoch": 3.47972972972973, "grad_norm": 0.17425312101840973, "learning_rate": 4.917146503649682e-05, "loss": 0.0839, "num_input_tokens_seen": 1276536, "step": 6695 }, { "epoch": 3.4823284823284824, "grad_norm": 0.527694046497345, "learning_rate": 4.916856751207528e-05, "loss": 0.0537, "num_input_tokens_seen": 1277432, "step": 6700 }, { "epoch": 3.484927234927235, "grad_norm": 1.2172460556030273, "learning_rate": 4.9165665015603465e-05, "loss": 0.2258, "num_input_tokens_seen": 1278424, "step": 6705 }, { "epoch": 3.4875259875259874, "grad_norm": 0.41738569736480713, "learning_rate": 4.9162757547678485e-05, "loss": 0.0745, "num_input_tokens_seen": 1279288, "step": 6710 }, { "epoch": 3.49012474012474, "grad_norm": 0.4151587188243866, "learning_rate": 4.9159845108898486e-05, "loss": 0.1493, "num_input_tokens_seen": 1280216, "step": 6715 }, { "epoch": 3.492723492723493, "grad_norm": 0.4038012623786926, "learning_rate": 4.915692769986261e-05, "loss": 0.1532, "num_input_tokens_seen": 1281144, "step": 6720 }, { "epoch": 3.4953222453222454, "grad_norm": 0.4873545467853546, "learning_rate": 4.9154005321171046e-05, "loss": 0.1497, "num_input_tokens_seen": 1282072, "step": 6725 }, { "epoch": 3.497920997920998, "grad_norm": 0.9372350573539734, "learning_rate": 4.915107797342501e-05, "loss": 0.123, "num_input_tokens_seen": 1283000, "step": 6730 }, { "epoch": 3.5005197505197505, "grad_norm": 0.6824483871459961, "learning_rate": 4.914814565722671e-05, "loss": 0.0916, "num_input_tokens_seen": 1284088, "step": 6735 }, { "epoch": 3.503118503118503, "grad_norm": 1.6511067152023315, "learning_rate": 4.91452083731794e-05, "loss": 0.1536, "num_input_tokens_seen": 1285016, "step": 6740 }, { "epoch": 3.5057172557172556, "grad_norm": 0.35220545530319214, "learning_rate": 4.914226612188735e-05, "loss": 0.2033, "num_input_tokens_seen": 1285912, "step": 6745 }, { "epoch": 3.508316008316008, "grad_norm": 0.7934876680374146, "learning_rate": 4.9139318903955846e-05, "loss": 0.1439, "num_input_tokens_seen": 1286904, "step": 6750 }, { "epoch": 3.510914760914761, "grad_norm": 0.2575710117816925, "learning_rate": 4.91363667199912e-05, "loss": 0.1125, "num_input_tokens_seen": 1287864, "step": 6755 }, { "epoch": 3.5135135135135136, "grad_norm": 0.19321459531784058, "learning_rate": 4.9133409570600754e-05, "loss": 0.1519, "num_input_tokens_seen": 1288824, "step": 6760 }, { "epoch": 3.516112266112266, "grad_norm": 0.6644372940063477, "learning_rate": 4.913044745639286e-05, "loss": 0.1131, "num_input_tokens_seen": 1289784, "step": 6765 }, { "epoch": 3.5187110187110187, "grad_norm": 0.450752854347229, "learning_rate": 4.91274803779769e-05, "loss": 0.1076, "num_input_tokens_seen": 1290712, "step": 6770 }, { "epoch": 3.521309771309771, "grad_norm": 0.18679001927375793, "learning_rate": 4.912450833596327e-05, "loss": 0.0753, "num_input_tokens_seen": 1291672, "step": 6775 }, { "epoch": 3.523908523908524, "grad_norm": 0.5110749006271362, "learning_rate": 4.9121531330963384e-05, "loss": 0.2535, "num_input_tokens_seen": 1292664, "step": 6780 }, { "epoch": 3.5265072765072762, "grad_norm": 0.5792052149772644, "learning_rate": 4.9118549363589694e-05, "loss": 0.119, "num_input_tokens_seen": 1293592, "step": 6785 }, { "epoch": 3.529106029106029, "grad_norm": 0.5540063977241516, "learning_rate": 4.911556243445565e-05, "loss": 0.1523, "num_input_tokens_seen": 1294552, "step": 6790 }, { "epoch": 3.5317047817047817, "grad_norm": 0.6056128740310669, "learning_rate": 4.911257054417574e-05, "loss": 0.2066, "num_input_tokens_seen": 1295576, "step": 6795 }, { "epoch": 3.5343035343035343, "grad_norm": 0.17812615633010864, "learning_rate": 4.910957369336547e-05, "loss": 0.1088, "num_input_tokens_seen": 1296536, "step": 6800 }, { "epoch": 3.536902286902287, "grad_norm": 2.249868869781494, "learning_rate": 4.910657188264136e-05, "loss": 0.1985, "num_input_tokens_seen": 1297528, "step": 6805 }, { "epoch": 3.5395010395010393, "grad_norm": 0.7053660750389099, "learning_rate": 4.910356511262095e-05, "loss": 0.2152, "num_input_tokens_seen": 1298488, "step": 6810 }, { "epoch": 3.5420997920997923, "grad_norm": 1.2508329153060913, "learning_rate": 4.910055338392282e-05, "loss": 0.1752, "num_input_tokens_seen": 1299448, "step": 6815 }, { "epoch": 3.544698544698545, "grad_norm": 0.35682621598243713, "learning_rate": 4.9097536697166534e-05, "loss": 0.2407, "num_input_tokens_seen": 1300376, "step": 6820 }, { "epoch": 3.5472972972972974, "grad_norm": 1.0146355628967285, "learning_rate": 4.909451505297271e-05, "loss": 0.229, "num_input_tokens_seen": 1301304, "step": 6825 }, { "epoch": 3.54989604989605, "grad_norm": 0.626995325088501, "learning_rate": 4.909148845196297e-05, "loss": 0.0998, "num_input_tokens_seen": 1302264, "step": 6830 }, { "epoch": 3.5524948024948024, "grad_norm": 0.711357593536377, "learning_rate": 4.908845689475995e-05, "loss": 0.1449, "num_input_tokens_seen": 1303256, "step": 6835 }, { "epoch": 3.555093555093555, "grad_norm": 0.38713762164115906, "learning_rate": 4.9085420381987326e-05, "loss": 0.1303, "num_input_tokens_seen": 1304184, "step": 6840 }, { "epoch": 3.5576923076923075, "grad_norm": 0.08467714488506317, "learning_rate": 4.908237891426977e-05, "loss": 0.1043, "num_input_tokens_seen": 1305144, "step": 6845 }, { "epoch": 3.5602910602910605, "grad_norm": 0.4531896710395813, "learning_rate": 4.907933249223299e-05, "loss": 0.2039, "num_input_tokens_seen": 1306104, "step": 6850 }, { "epoch": 3.562889812889813, "grad_norm": 0.7044157981872559, "learning_rate": 4.907628111650371e-05, "loss": 0.262, "num_input_tokens_seen": 1307096, "step": 6855 }, { "epoch": 3.5654885654885655, "grad_norm": 1.131232738494873, "learning_rate": 4.907322478770966e-05, "loss": 0.1271, "num_input_tokens_seen": 1307960, "step": 6860 }, { "epoch": 3.568087318087318, "grad_norm": 0.3440249264240265, "learning_rate": 4.907016350647961e-05, "loss": 0.0934, "num_input_tokens_seen": 1308888, "step": 6865 }, { "epoch": 3.5706860706860706, "grad_norm": 0.22169166803359985, "learning_rate": 4.906709727344334e-05, "loss": 0.1454, "num_input_tokens_seen": 1309912, "step": 6870 }, { "epoch": 3.5732848232848236, "grad_norm": 0.6095912456512451, "learning_rate": 4.906402608923164e-05, "loss": 0.2024, "num_input_tokens_seen": 1310904, "step": 6875 }, { "epoch": 3.5758835758835756, "grad_norm": 0.1719912886619568, "learning_rate": 4.906094995447632e-05, "loss": 0.1395, "num_input_tokens_seen": 1311928, "step": 6880 }, { "epoch": 3.5784823284823286, "grad_norm": 0.9714329838752747, "learning_rate": 4.905786886981023e-05, "loss": 0.2134, "num_input_tokens_seen": 1312888, "step": 6885 }, { "epoch": 3.581081081081081, "grad_norm": 0.7357578873634338, "learning_rate": 4.905478283586722e-05, "loss": 0.2547, "num_input_tokens_seen": 1313848, "step": 6890 }, { "epoch": 3.5836798336798337, "grad_norm": 1.639349341392517, "learning_rate": 4.905169185328216e-05, "loss": 0.1794, "num_input_tokens_seen": 1314776, "step": 6895 }, { "epoch": 3.586278586278586, "grad_norm": 0.6580598950386047, "learning_rate": 4.904859592269092e-05, "loss": 0.1867, "num_input_tokens_seen": 1315800, "step": 6900 }, { "epoch": 3.5888773388773387, "grad_norm": 0.43988707661628723, "learning_rate": 4.904549504473043e-05, "loss": 0.1206, "num_input_tokens_seen": 1316728, "step": 6905 }, { "epoch": 3.5914760914760917, "grad_norm": 1.106540560722351, "learning_rate": 4.90423892200386e-05, "loss": 0.1991, "num_input_tokens_seen": 1317656, "step": 6910 }, { "epoch": 3.5940748440748442, "grad_norm": 0.9110885858535767, "learning_rate": 4.903927844925438e-05, "loss": 0.1854, "num_input_tokens_seen": 1318680, "step": 6915 }, { "epoch": 3.5966735966735968, "grad_norm": 0.44715169072151184, "learning_rate": 4.903616273301773e-05, "loss": 0.174, "num_input_tokens_seen": 1319672, "step": 6920 }, { "epoch": 3.5992723492723493, "grad_norm": 0.8274012804031372, "learning_rate": 4.903304207196962e-05, "loss": 0.1632, "num_input_tokens_seen": 1320696, "step": 6925 }, { "epoch": 3.601871101871102, "grad_norm": 0.54121994972229, "learning_rate": 4.902991646675206e-05, "loss": 0.0835, "num_input_tokens_seen": 1321624, "step": 6930 }, { "epoch": 3.6044698544698544, "grad_norm": 0.3276815116405487, "learning_rate": 4.902678591800804e-05, "loss": 0.056, "num_input_tokens_seen": 1322488, "step": 6935 }, { "epoch": 3.607068607068607, "grad_norm": 0.17262549698352814, "learning_rate": 4.90236504263816e-05, "loss": 0.1155, "num_input_tokens_seen": 1323416, "step": 6940 }, { "epoch": 3.60966735966736, "grad_norm": 0.6190741658210754, "learning_rate": 4.902050999251779e-05, "loss": 0.159, "num_input_tokens_seen": 1324376, "step": 6945 }, { "epoch": 3.6122661122661124, "grad_norm": 1.3962609767913818, "learning_rate": 4.901736461706266e-05, "loss": 0.305, "num_input_tokens_seen": 1325336, "step": 6950 }, { "epoch": 3.614864864864865, "grad_norm": 1.1562913656234741, "learning_rate": 4.9014214300663304e-05, "loss": 0.1244, "num_input_tokens_seen": 1326264, "step": 6955 }, { "epoch": 3.6174636174636174, "grad_norm": 0.378842830657959, "learning_rate": 4.90110590439678e-05, "loss": 0.1405, "num_input_tokens_seen": 1327256, "step": 6960 }, { "epoch": 3.62006237006237, "grad_norm": 0.4657372236251831, "learning_rate": 4.900789884762527e-05, "loss": 0.2175, "num_input_tokens_seen": 1328216, "step": 6965 }, { "epoch": 3.6226611226611225, "grad_norm": 0.4666183292865753, "learning_rate": 4.900473371228585e-05, "loss": 0.1234, "num_input_tokens_seen": 1329048, "step": 6970 }, { "epoch": 3.625259875259875, "grad_norm": 0.30762144923210144, "learning_rate": 4.900156363860066e-05, "loss": 0.1141, "num_input_tokens_seen": 1329944, "step": 6975 }, { "epoch": 3.627858627858628, "grad_norm": 0.5807668566703796, "learning_rate": 4.899838862722187e-05, "loss": 0.1164, "num_input_tokens_seen": 1330904, "step": 6980 }, { "epoch": 3.6304573804573805, "grad_norm": 0.2316208779811859, "learning_rate": 4.8995208678802666e-05, "loss": 0.1831, "num_input_tokens_seen": 1331800, "step": 6985 }, { "epoch": 3.633056133056133, "grad_norm": 0.7228038907051086, "learning_rate": 4.899202379399723e-05, "loss": 0.1585, "num_input_tokens_seen": 1332760, "step": 6990 }, { "epoch": 3.6356548856548856, "grad_norm": 0.22757534682750702, "learning_rate": 4.898883397346077e-05, "loss": 0.1145, "num_input_tokens_seen": 1333688, "step": 6995 }, { "epoch": 3.638253638253638, "grad_norm": 0.575699508190155, "learning_rate": 4.898563921784951e-05, "loss": 0.2395, "num_input_tokens_seen": 1334648, "step": 7000 }, { "epoch": 3.640852390852391, "grad_norm": 0.5311689376831055, "learning_rate": 4.898243952782069e-05, "loss": 0.1199, "num_input_tokens_seen": 1335608, "step": 7005 }, { "epoch": 3.643451143451143, "grad_norm": 0.614203155040741, "learning_rate": 4.897923490403256e-05, "loss": 0.1229, "num_input_tokens_seen": 1336568, "step": 7010 }, { "epoch": 3.646049896049896, "grad_norm": 1.9789725542068481, "learning_rate": 4.897602534714437e-05, "loss": 0.1925, "num_input_tokens_seen": 1337624, "step": 7015 }, { "epoch": 3.6486486486486487, "grad_norm": 0.9414392113685608, "learning_rate": 4.897281085781643e-05, "loss": 0.2047, "num_input_tokens_seen": 1338584, "step": 7020 }, { "epoch": 3.651247401247401, "grad_norm": 1.477655053138733, "learning_rate": 4.896959143671002e-05, "loss": 0.2638, "num_input_tokens_seen": 1339448, "step": 7025 }, { "epoch": 3.6538461538461537, "grad_norm": 0.3039918839931488, "learning_rate": 4.8966367084487466e-05, "loss": 0.1712, "num_input_tokens_seen": 1340408, "step": 7030 }, { "epoch": 3.6564449064449063, "grad_norm": 0.21537841856479645, "learning_rate": 4.8963137801812074e-05, "loss": 0.1876, "num_input_tokens_seen": 1341304, "step": 7035 }, { "epoch": 3.6590436590436592, "grad_norm": 0.41826096177101135, "learning_rate": 4.89599035893482e-05, "loss": 0.1186, "num_input_tokens_seen": 1342328, "step": 7040 }, { "epoch": 3.6616424116424118, "grad_norm": 1.3781179189682007, "learning_rate": 4.895666444776119e-05, "loss": 0.2474, "num_input_tokens_seen": 1343256, "step": 7045 }, { "epoch": 3.6642411642411643, "grad_norm": 0.12050219625234604, "learning_rate": 4.895342037771743e-05, "loss": 0.1155, "num_input_tokens_seen": 1344216, "step": 7050 }, { "epoch": 3.666839916839917, "grad_norm": 0.4921064078807831, "learning_rate": 4.8950171379884273e-05, "loss": 0.1229, "num_input_tokens_seen": 1345176, "step": 7055 }, { "epoch": 3.6694386694386694, "grad_norm": 0.20100392401218414, "learning_rate": 4.8946917454930144e-05, "loss": 0.0836, "num_input_tokens_seen": 1346104, "step": 7060 }, { "epoch": 3.672037422037422, "grad_norm": 0.16628436744213104, "learning_rate": 4.8943658603524435e-05, "loss": 0.1632, "num_input_tokens_seen": 1347096, "step": 7065 }, { "epoch": 3.6746361746361744, "grad_norm": 0.8005128502845764, "learning_rate": 4.894039482633757e-05, "loss": 0.1767, "num_input_tokens_seen": 1348056, "step": 7070 }, { "epoch": 3.6772349272349274, "grad_norm": 0.9390827417373657, "learning_rate": 4.893712612404099e-05, "loss": 0.1859, "num_input_tokens_seen": 1348952, "step": 7075 }, { "epoch": 3.67983367983368, "grad_norm": 0.22372958064079285, "learning_rate": 4.8933852497307156e-05, "loss": 0.1259, "num_input_tokens_seen": 1349976, "step": 7080 }, { "epoch": 3.6824324324324325, "grad_norm": 0.8751797676086426, "learning_rate": 4.893057394680951e-05, "loss": 0.1508, "num_input_tokens_seen": 1350904, "step": 7085 }, { "epoch": 3.685031185031185, "grad_norm": 0.5353655219078064, "learning_rate": 4.892729047322254e-05, "loss": 0.188, "num_input_tokens_seen": 1351800, "step": 7090 }, { "epoch": 3.6876299376299375, "grad_norm": 0.24794818460941315, "learning_rate": 4.892400207722174e-05, "loss": 0.1607, "num_input_tokens_seen": 1352696, "step": 7095 }, { "epoch": 3.6902286902286905, "grad_norm": 1.4771993160247803, "learning_rate": 4.892070875948359e-05, "loss": 0.2085, "num_input_tokens_seen": 1353592, "step": 7100 }, { "epoch": 3.6928274428274426, "grad_norm": 0.447824090719223, "learning_rate": 4.8917410520685635e-05, "loss": 0.1368, "num_input_tokens_seen": 1354456, "step": 7105 }, { "epoch": 3.6954261954261955, "grad_norm": 0.9023045301437378, "learning_rate": 4.8914107361506376e-05, "loss": 0.1257, "num_input_tokens_seen": 1355384, "step": 7110 }, { "epoch": 3.698024948024948, "grad_norm": 0.7746492028236389, "learning_rate": 4.891079928262537e-05, "loss": 0.1052, "num_input_tokens_seen": 1356344, "step": 7115 }, { "epoch": 3.7006237006237006, "grad_norm": 1.2725846767425537, "learning_rate": 4.890748628472314e-05, "loss": 0.2008, "num_input_tokens_seen": 1357400, "step": 7120 }, { "epoch": 3.703222453222453, "grad_norm": 1.3447749614715576, "learning_rate": 4.890416836848127e-05, "loss": 0.1151, "num_input_tokens_seen": 1358424, "step": 7125 }, { "epoch": 3.7058212058212057, "grad_norm": 0.7918412089347839, "learning_rate": 4.890084553458234e-05, "loss": 0.147, "num_input_tokens_seen": 1359352, "step": 7130 }, { "epoch": 3.7084199584199586, "grad_norm": 0.4994165897369385, "learning_rate": 4.889751778370993e-05, "loss": 0.0858, "num_input_tokens_seen": 1360248, "step": 7135 }, { "epoch": 3.711018711018711, "grad_norm": 0.4039507806301117, "learning_rate": 4.889418511654863e-05, "loss": 0.1279, "num_input_tokens_seen": 1361240, "step": 7140 }, { "epoch": 3.7136174636174637, "grad_norm": 0.9769052267074585, "learning_rate": 4.889084753378405e-05, "loss": 0.1466, "num_input_tokens_seen": 1362264, "step": 7145 }, { "epoch": 3.7162162162162162, "grad_norm": 0.7385595440864563, "learning_rate": 4.888750503610282e-05, "loss": 0.114, "num_input_tokens_seen": 1363224, "step": 7150 }, { "epoch": 3.7188149688149688, "grad_norm": 0.3717707395553589, "learning_rate": 4.888415762419256e-05, "loss": 0.0807, "num_input_tokens_seen": 1364152, "step": 7155 }, { "epoch": 3.7214137214137213, "grad_norm": 0.533104658126831, "learning_rate": 4.8880805298741915e-05, "loss": 0.0832, "num_input_tokens_seen": 1364984, "step": 7160 }, { "epoch": 3.724012474012474, "grad_norm": 0.3699653446674347, "learning_rate": 4.887744806044055e-05, "loss": 0.1242, "num_input_tokens_seen": 1365912, "step": 7165 }, { "epoch": 3.726611226611227, "grad_norm": 0.22786255180835724, "learning_rate": 4.8874085909979116e-05, "loss": 0.1055, "num_input_tokens_seen": 1366808, "step": 7170 }, { "epoch": 3.7292099792099793, "grad_norm": 0.4440445005893707, "learning_rate": 4.887071884804929e-05, "loss": 0.0687, "num_input_tokens_seen": 1367800, "step": 7175 }, { "epoch": 3.731808731808732, "grad_norm": 0.16344811022281647, "learning_rate": 4.886734687534377e-05, "loss": 0.084, "num_input_tokens_seen": 1368824, "step": 7180 }, { "epoch": 3.7344074844074844, "grad_norm": 0.9185784459114075, "learning_rate": 4.886396999255622e-05, "loss": 0.193, "num_input_tokens_seen": 1369752, "step": 7185 }, { "epoch": 3.737006237006237, "grad_norm": 0.16213004291057587, "learning_rate": 4.8860588200381376e-05, "loss": 0.1258, "num_input_tokens_seen": 1370744, "step": 7190 }, { "epoch": 3.73960498960499, "grad_norm": 0.7904793620109558, "learning_rate": 4.8857201499514946e-05, "loss": 0.1582, "num_input_tokens_seen": 1371736, "step": 7195 }, { "epoch": 3.742203742203742, "grad_norm": 0.41405579447746277, "learning_rate": 4.885380989065365e-05, "loss": 0.1654, "num_input_tokens_seen": 1372792, "step": 7200 }, { "epoch": 3.744802494802495, "grad_norm": 0.29308775067329407, "learning_rate": 4.885041337449522e-05, "loss": 0.2036, "num_input_tokens_seen": 1373816, "step": 7205 }, { "epoch": 3.7474012474012475, "grad_norm": 0.792744517326355, "learning_rate": 4.8847011951738395e-05, "loss": 0.1572, "num_input_tokens_seen": 1374808, "step": 7210 }, { "epoch": 3.75, "grad_norm": 0.11400050669908524, "learning_rate": 4.884360562308295e-05, "loss": 0.0793, "num_input_tokens_seen": 1375768, "step": 7215 }, { "epoch": 3.7525987525987525, "grad_norm": 0.7597236633300781, "learning_rate": 4.8840194389229634e-05, "loss": 0.2163, "num_input_tokens_seen": 1376696, "step": 7220 }, { "epoch": 3.755197505197505, "grad_norm": 0.19584162533283234, "learning_rate": 4.883677825088022e-05, "loss": 0.1273, "num_input_tokens_seen": 1377688, "step": 7225 }, { "epoch": 3.757796257796258, "grad_norm": 1.260856032371521, "learning_rate": 4.883335720873749e-05, "loss": 0.2213, "num_input_tokens_seen": 1378584, "step": 7230 }, { "epoch": 3.76039501039501, "grad_norm": 0.707770824432373, "learning_rate": 4.882993126350524e-05, "loss": 0.1828, "num_input_tokens_seen": 1379576, "step": 7235 }, { "epoch": 3.762993762993763, "grad_norm": 0.7112668752670288, "learning_rate": 4.8826500415888255e-05, "loss": 0.1542, "num_input_tokens_seen": 1380536, "step": 7240 }, { "epoch": 3.7655925155925156, "grad_norm": 1.1054234504699707, "learning_rate": 4.882306466659235e-05, "loss": 0.1357, "num_input_tokens_seen": 1381528, "step": 7245 }, { "epoch": 3.768191268191268, "grad_norm": 0.11764021217823029, "learning_rate": 4.881962401632434e-05, "loss": 0.0847, "num_input_tokens_seen": 1382520, "step": 7250 }, { "epoch": 3.7707900207900207, "grad_norm": 0.02827267535030842, "learning_rate": 4.881617846579205e-05, "loss": 0.2237, "num_input_tokens_seen": 1383416, "step": 7255 }, { "epoch": 3.773388773388773, "grad_norm": 1.0276066064834595, "learning_rate": 4.8812728015704314e-05, "loss": 0.2359, "num_input_tokens_seen": 1384312, "step": 7260 }, { "epoch": 3.775987525987526, "grad_norm": 1.6954423189163208, "learning_rate": 4.8809272666770964e-05, "loss": 0.2049, "num_input_tokens_seen": 1385176, "step": 7265 }, { "epoch": 3.7785862785862787, "grad_norm": 0.4647030234336853, "learning_rate": 4.880581241970286e-05, "loss": 0.1862, "num_input_tokens_seen": 1386168, "step": 7270 }, { "epoch": 3.7811850311850312, "grad_norm": 0.19639945030212402, "learning_rate": 4.880234727521184e-05, "loss": 0.0693, "num_input_tokens_seen": 1387096, "step": 7275 }, { "epoch": 3.7837837837837838, "grad_norm": 0.5337965488433838, "learning_rate": 4.879887723401078e-05, "loss": 0.1051, "num_input_tokens_seen": 1388024, "step": 7280 }, { "epoch": 3.7863825363825363, "grad_norm": 0.17774751782417297, "learning_rate": 4.879540229681354e-05, "loss": 0.0776, "num_input_tokens_seen": 1388984, "step": 7285 }, { "epoch": 3.788981288981289, "grad_norm": 0.49591007828712463, "learning_rate": 4.8791922464335015e-05, "loss": 0.1526, "num_input_tokens_seen": 1389976, "step": 7290 }, { "epoch": 3.7915800415800414, "grad_norm": 1.3110102415084839, "learning_rate": 4.878843773729107e-05, "loss": 0.1994, "num_input_tokens_seen": 1390936, "step": 7295 }, { "epoch": 3.7941787941787943, "grad_norm": 0.25815677642822266, "learning_rate": 4.878494811639861e-05, "loss": 0.181, "num_input_tokens_seen": 1391864, "step": 7300 }, { "epoch": 3.796777546777547, "grad_norm": 0.3854551315307617, "learning_rate": 4.878145360237553e-05, "loss": 0.1275, "num_input_tokens_seen": 1392856, "step": 7305 }, { "epoch": 3.7993762993762994, "grad_norm": 0.9005913734436035, "learning_rate": 4.877795419594073e-05, "loss": 0.1156, "num_input_tokens_seen": 1393752, "step": 7310 }, { "epoch": 3.801975051975052, "grad_norm": 0.6769879460334778, "learning_rate": 4.877444989781412e-05, "loss": 0.2403, "num_input_tokens_seen": 1394712, "step": 7315 }, { "epoch": 3.8045738045738045, "grad_norm": 0.2029801309108734, "learning_rate": 4.877094070871663e-05, "loss": 0.2812, "num_input_tokens_seen": 1395704, "step": 7320 }, { "epoch": 3.8071725571725574, "grad_norm": 0.7259919047355652, "learning_rate": 4.876742662937016e-05, "loss": 0.1999, "num_input_tokens_seen": 1396600, "step": 7325 }, { "epoch": 3.8097713097713095, "grad_norm": 1.2807962894439697, "learning_rate": 4.8763907660497666e-05, "loss": 0.1495, "num_input_tokens_seen": 1397592, "step": 7330 }, { "epoch": 3.8123700623700625, "grad_norm": 0.8540860414505005, "learning_rate": 4.8760383802823074e-05, "loss": 0.1195, "num_input_tokens_seen": 1398520, "step": 7335 }, { "epoch": 3.814968814968815, "grad_norm": 1.4343878030776978, "learning_rate": 4.875685505707133e-05, "loss": 0.3064, "num_input_tokens_seen": 1399416, "step": 7340 }, { "epoch": 3.8175675675675675, "grad_norm": 0.1916428506374359, "learning_rate": 4.875332142396836e-05, "loss": 0.1515, "num_input_tokens_seen": 1400344, "step": 7345 }, { "epoch": 3.82016632016632, "grad_norm": 0.9471240639686584, "learning_rate": 4.874978290424114e-05, "loss": 0.1755, "num_input_tokens_seen": 1401272, "step": 7350 }, { "epoch": 3.8227650727650726, "grad_norm": 0.366447389125824, "learning_rate": 4.874623949861762e-05, "loss": 0.1507, "num_input_tokens_seen": 1402232, "step": 7355 }, { "epoch": 3.8253638253638256, "grad_norm": 0.6198863983154297, "learning_rate": 4.874269120782676e-05, "loss": 0.1669, "num_input_tokens_seen": 1403192, "step": 7360 }, { "epoch": 3.827962577962578, "grad_norm": 0.6613719463348389, "learning_rate": 4.8739138032598544e-05, "loss": 0.1622, "num_input_tokens_seen": 1404088, "step": 7365 }, { "epoch": 3.8305613305613306, "grad_norm": 0.33669236302375793, "learning_rate": 4.8735579973663916e-05, "loss": 0.2036, "num_input_tokens_seen": 1405048, "step": 7370 }, { "epoch": 3.833160083160083, "grad_norm": 1.9676287174224854, "learning_rate": 4.873201703175487e-05, "loss": 0.192, "num_input_tokens_seen": 1405976, "step": 7375 }, { "epoch": 3.8357588357588357, "grad_norm": 0.13440999388694763, "learning_rate": 4.87284492076044e-05, "loss": 0.1235, "num_input_tokens_seen": 1406904, "step": 7380 }, { "epoch": 3.8383575883575882, "grad_norm": 0.6055276393890381, "learning_rate": 4.872487650194647e-05, "loss": 0.1298, "num_input_tokens_seen": 1407896, "step": 7385 }, { "epoch": 3.8409563409563408, "grad_norm": 0.486114501953125, "learning_rate": 4.872129891551608e-05, "loss": 0.1096, "num_input_tokens_seen": 1408824, "step": 7390 }, { "epoch": 3.8435550935550937, "grad_norm": 0.7698351144790649, "learning_rate": 4.871771644904923e-05, "loss": 0.1163, "num_input_tokens_seen": 1409784, "step": 7395 }, { "epoch": 3.8461538461538463, "grad_norm": 0.4000529944896698, "learning_rate": 4.8714129103282916e-05, "loss": 0.2135, "num_input_tokens_seen": 1410680, "step": 7400 }, { "epoch": 3.848752598752599, "grad_norm": 0.9936002492904663, "learning_rate": 4.871053687895513e-05, "loss": 0.2389, "num_input_tokens_seen": 1411608, "step": 7405 }, { "epoch": 3.8513513513513513, "grad_norm": 0.31291675567626953, "learning_rate": 4.8706939776804896e-05, "loss": 0.0742, "num_input_tokens_seen": 1412600, "step": 7410 }, { "epoch": 3.853950103950104, "grad_norm": 1.4716601371765137, "learning_rate": 4.8703337797572206e-05, "loss": 0.2163, "num_input_tokens_seen": 1413592, "step": 7415 }, { "epoch": 3.856548856548857, "grad_norm": 0.500347912311554, "learning_rate": 4.8699730941998086e-05, "loss": 0.1139, "num_input_tokens_seen": 1414488, "step": 7420 }, { "epoch": 3.859147609147609, "grad_norm": 0.2801614999771118, "learning_rate": 4.8696119210824544e-05, "loss": 0.1969, "num_input_tokens_seen": 1415448, "step": 7425 }, { "epoch": 3.861746361746362, "grad_norm": 1.4576160907745361, "learning_rate": 4.86925026047946e-05, "loss": 0.172, "num_input_tokens_seen": 1416440, "step": 7430 }, { "epoch": 3.8643451143451144, "grad_norm": 0.9455264210700989, "learning_rate": 4.8688881124652284e-05, "loss": 0.1391, "num_input_tokens_seen": 1417400, "step": 7435 }, { "epoch": 3.866943866943867, "grad_norm": 0.22575391829013824, "learning_rate": 4.868525477114262e-05, "loss": 0.1494, "num_input_tokens_seen": 1418296, "step": 7440 }, { "epoch": 3.8695426195426195, "grad_norm": 0.5131872296333313, "learning_rate": 4.868162354501162e-05, "loss": 0.1345, "num_input_tokens_seen": 1419288, "step": 7445 }, { "epoch": 3.872141372141372, "grad_norm": 1.4304933547973633, "learning_rate": 4.867798744700633e-05, "loss": 0.2673, "num_input_tokens_seen": 1420248, "step": 7450 }, { "epoch": 3.874740124740125, "grad_norm": 0.23076634109020233, "learning_rate": 4.8674346477874774e-05, "loss": 0.1683, "num_input_tokens_seen": 1421272, "step": 7455 }, { "epoch": 3.8773388773388775, "grad_norm": 0.7619625329971313, "learning_rate": 4.8670700638365996e-05, "loss": 0.2346, "num_input_tokens_seen": 1422264, "step": 7460 }, { "epoch": 3.87993762993763, "grad_norm": 0.2102816253900528, "learning_rate": 4.8667049929230015e-05, "loss": 0.1329, "num_input_tokens_seen": 1423128, "step": 7465 }, { "epoch": 3.8825363825363826, "grad_norm": 0.3554080128669739, "learning_rate": 4.8663394351217884e-05, "loss": 0.0567, "num_input_tokens_seen": 1424056, "step": 7470 }, { "epoch": 3.885135135135135, "grad_norm": 0.23357237875461578, "learning_rate": 4.8659733905081634e-05, "loss": 0.1861, "num_input_tokens_seen": 1424984, "step": 7475 }, { "epoch": 3.8877338877338876, "grad_norm": 0.3813580870628357, "learning_rate": 4.865606859157431e-05, "loss": 0.1264, "num_input_tokens_seen": 1425912, "step": 7480 }, { "epoch": 3.89033264033264, "grad_norm": 1.4682844877243042, "learning_rate": 4.865239841144995e-05, "loss": 0.1675, "num_input_tokens_seen": 1426936, "step": 7485 }, { "epoch": 3.892931392931393, "grad_norm": 0.23628434538841248, "learning_rate": 4.864872336546361e-05, "loss": 0.0787, "num_input_tokens_seen": 1427864, "step": 7490 }, { "epoch": 3.8955301455301456, "grad_norm": 2.1597697734832764, "learning_rate": 4.864504345437132e-05, "loss": 0.2105, "num_input_tokens_seen": 1428824, "step": 7495 }, { "epoch": 3.898128898128898, "grad_norm": 1.3113296031951904, "learning_rate": 4.864135867893013e-05, "loss": 0.2337, "num_input_tokens_seen": 1429784, "step": 7500 }, { "epoch": 3.9007276507276507, "grad_norm": 0.5343814492225647, "learning_rate": 4.863766903989809e-05, "loss": 0.137, "num_input_tokens_seen": 1430648, "step": 7505 }, { "epoch": 3.9033264033264032, "grad_norm": 0.15864455699920654, "learning_rate": 4.8633974538034235e-05, "loss": 0.236, "num_input_tokens_seen": 1431544, "step": 7510 }, { "epoch": 3.9059251559251558, "grad_norm": 0.28019091486930847, "learning_rate": 4.863027517409863e-05, "loss": 0.1631, "num_input_tokens_seen": 1432472, "step": 7515 }, { "epoch": 3.9085239085239083, "grad_norm": 0.4320010542869568, "learning_rate": 4.8626570948852316e-05, "loss": 0.1127, "num_input_tokens_seen": 1433560, "step": 7520 }, { "epoch": 3.9111226611226613, "grad_norm": 0.33335408568382263, "learning_rate": 4.8622861863057325e-05, "loss": 0.1191, "num_input_tokens_seen": 1434424, "step": 7525 }, { "epoch": 3.913721413721414, "grad_norm": 0.22325798869132996, "learning_rate": 4.861914791747672e-05, "loss": 0.1488, "num_input_tokens_seen": 1435384, "step": 7530 }, { "epoch": 3.9163201663201663, "grad_norm": 0.984046459197998, "learning_rate": 4.861542911287454e-05, "loss": 0.2444, "num_input_tokens_seen": 1436376, "step": 7535 }, { "epoch": 3.918918918918919, "grad_norm": 0.2521229386329651, "learning_rate": 4.861170545001585e-05, "loss": 0.1691, "num_input_tokens_seen": 1437304, "step": 7540 }, { "epoch": 3.9215176715176714, "grad_norm": 0.4360942542552948, "learning_rate": 4.860797692966668e-05, "loss": 0.2327, "num_input_tokens_seen": 1438264, "step": 7545 }, { "epoch": 3.9241164241164244, "grad_norm": 0.45967522263526917, "learning_rate": 4.860424355259408e-05, "loss": 0.1931, "num_input_tokens_seen": 1439224, "step": 7550 }, { "epoch": 3.9267151767151764, "grad_norm": 0.5861141085624695, "learning_rate": 4.860050531956608e-05, "loss": 0.1781, "num_input_tokens_seen": 1440184, "step": 7555 }, { "epoch": 3.9293139293139294, "grad_norm": 1.2964575290679932, "learning_rate": 4.859676223135175e-05, "loss": 0.2853, "num_input_tokens_seen": 1441144, "step": 7560 }, { "epoch": 3.931912681912682, "grad_norm": 0.3082733154296875, "learning_rate": 4.8593014288721106e-05, "loss": 0.1229, "num_input_tokens_seen": 1442104, "step": 7565 }, { "epoch": 3.9345114345114345, "grad_norm": 1.1195558309555054, "learning_rate": 4.858926149244521e-05, "loss": 0.1636, "num_input_tokens_seen": 1443064, "step": 7570 }, { "epoch": 3.937110187110187, "grad_norm": 0.45798298716545105, "learning_rate": 4.8585503843296095e-05, "loss": 0.1715, "num_input_tokens_seen": 1444088, "step": 7575 }, { "epoch": 3.9397089397089395, "grad_norm": 0.6795455813407898, "learning_rate": 4.858174134204679e-05, "loss": 0.1688, "num_input_tokens_seen": 1445048, "step": 7580 }, { "epoch": 3.9423076923076925, "grad_norm": 0.7910305261611938, "learning_rate": 4.857797398947135e-05, "loss": 0.1529, "num_input_tokens_seen": 1446040, "step": 7585 }, { "epoch": 3.944906444906445, "grad_norm": 0.3660849630832672, "learning_rate": 4.857420178634479e-05, "loss": 0.0804, "num_input_tokens_seen": 1446936, "step": 7590 }, { "epoch": 3.9475051975051976, "grad_norm": 1.1058038473129272, "learning_rate": 4.857042473344315e-05, "loss": 0.1053, "num_input_tokens_seen": 1447832, "step": 7595 }, { "epoch": 3.95010395010395, "grad_norm": 0.9121595025062561, "learning_rate": 4.8566642831543464e-05, "loss": 0.1796, "num_input_tokens_seen": 1448824, "step": 7600 }, { "epoch": 3.9527027027027026, "grad_norm": 0.44097113609313965, "learning_rate": 4.856285608142375e-05, "loss": 0.133, "num_input_tokens_seen": 1449816, "step": 7605 }, { "epoch": 3.955301455301455, "grad_norm": 0.7374258041381836, "learning_rate": 4.855906448386304e-05, "loss": 0.1624, "num_input_tokens_seen": 1450744, "step": 7610 }, { "epoch": 3.9579002079002077, "grad_norm": 0.7315781116485596, "learning_rate": 4.855526803964136e-05, "loss": 0.1388, "num_input_tokens_seen": 1451704, "step": 7615 }, { "epoch": 3.9604989604989607, "grad_norm": 0.7006673216819763, "learning_rate": 4.855146674953971e-05, "loss": 0.137, "num_input_tokens_seen": 1452600, "step": 7620 }, { "epoch": 3.963097713097713, "grad_norm": 0.3789898753166199, "learning_rate": 4.854766061434013e-05, "loss": 0.1778, "num_input_tokens_seen": 1453528, "step": 7625 }, { "epoch": 3.9656964656964657, "grad_norm": 1.161561131477356, "learning_rate": 4.8543849634825615e-05, "loss": 0.1666, "num_input_tokens_seen": 1454520, "step": 7630 }, { "epoch": 3.9682952182952183, "grad_norm": 0.7467971444129944, "learning_rate": 4.8540033811780184e-05, "loss": 0.1475, "num_input_tokens_seen": 1455512, "step": 7635 }, { "epoch": 3.970893970893971, "grad_norm": 1.3430043458938599, "learning_rate": 4.853621314598884e-05, "loss": 0.1813, "num_input_tokens_seen": 1456536, "step": 7640 }, { "epoch": 3.9734927234927238, "grad_norm": 0.26527783274650574, "learning_rate": 4.8532387638237585e-05, "loss": 0.1389, "num_input_tokens_seen": 1457528, "step": 7645 }, { "epoch": 3.976091476091476, "grad_norm": 1.0304033756256104, "learning_rate": 4.852855728931342e-05, "loss": 0.1525, "num_input_tokens_seen": 1458456, "step": 7650 }, { "epoch": 3.978690228690229, "grad_norm": 0.8834590911865234, "learning_rate": 4.852472210000433e-05, "loss": 0.1567, "num_input_tokens_seen": 1459384, "step": 7655 }, { "epoch": 3.9812889812889813, "grad_norm": 0.14651522040367126, "learning_rate": 4.8520882071099304e-05, "loss": 0.0695, "num_input_tokens_seen": 1460376, "step": 7660 }, { "epoch": 3.983887733887734, "grad_norm": 1.6328476667404175, "learning_rate": 4.851703720338835e-05, "loss": 0.205, "num_input_tokens_seen": 1461336, "step": 7665 }, { "epoch": 3.9864864864864864, "grad_norm": 1.1127984523773193, "learning_rate": 4.8513187497662424e-05, "loss": 0.2127, "num_input_tokens_seen": 1462328, "step": 7670 }, { "epoch": 3.989085239085239, "grad_norm": 0.17677664756774902, "learning_rate": 4.850933295471351e-05, "loss": 0.1091, "num_input_tokens_seen": 1463288, "step": 7675 }, { "epoch": 3.991683991683992, "grad_norm": 0.48901084065437317, "learning_rate": 4.8505473575334584e-05, "loss": 0.1359, "num_input_tokens_seen": 1464248, "step": 7680 }, { "epoch": 3.9942827442827444, "grad_norm": 0.9498328566551208, "learning_rate": 4.8501609360319604e-05, "loss": 0.2422, "num_input_tokens_seen": 1465240, "step": 7685 }, { "epoch": 3.996881496881497, "grad_norm": 0.25248318910598755, "learning_rate": 4.849774031046354e-05, "loss": 0.1834, "num_input_tokens_seen": 1466200, "step": 7690 }, { "epoch": 3.9994802494802495, "grad_norm": 0.6555842757225037, "learning_rate": 4.8493866426562343e-05, "loss": 0.1038, "num_input_tokens_seen": 1467128, "step": 7695 }, { "epoch": 4.0, "eval_loss": 0.15451467037200928, "eval_runtime": 8.021, "eval_samples_per_second": 106.72, "eval_steps_per_second": 26.68, "num_input_tokens_seen": 1467248, "step": 7696 }, { "epoch": 4.002079002079002, "grad_norm": 0.7193356156349182, "learning_rate": 4.8489987709412966e-05, "loss": 0.0873, "num_input_tokens_seen": 1468048, "step": 7700 }, { "epoch": 4.004677754677755, "grad_norm": 0.3903200924396515, "learning_rate": 4.848610415981335e-05, "loss": 0.1371, "num_input_tokens_seen": 1468976, "step": 7705 }, { "epoch": 4.007276507276507, "grad_norm": 0.8941345810890198, "learning_rate": 4.848221577856243e-05, "loss": 0.2096, "num_input_tokens_seen": 1469936, "step": 7710 }, { "epoch": 4.00987525987526, "grad_norm": 0.34400683641433716, "learning_rate": 4.8478322566460154e-05, "loss": 0.1331, "num_input_tokens_seen": 1470896, "step": 7715 }, { "epoch": 4.012474012474012, "grad_norm": 0.6796194314956665, "learning_rate": 4.8474424524307435e-05, "loss": 0.1147, "num_input_tokens_seen": 1471856, "step": 7720 }, { "epoch": 4.015072765072765, "grad_norm": 0.621359646320343, "learning_rate": 4.847052165290619e-05, "loss": 0.16, "num_input_tokens_seen": 1472784, "step": 7725 }, { "epoch": 4.017671517671518, "grad_norm": 0.8330400586128235, "learning_rate": 4.8466613953059344e-05, "loss": 0.0759, "num_input_tokens_seen": 1473712, "step": 7730 }, { "epoch": 4.02027027027027, "grad_norm": 0.5501909255981445, "learning_rate": 4.84627014255708e-05, "loss": 0.1131, "num_input_tokens_seen": 1474608, "step": 7735 }, { "epoch": 4.022869022869023, "grad_norm": 1.5822776556015015, "learning_rate": 4.845878407124545e-05, "loss": 0.1864, "num_input_tokens_seen": 1475504, "step": 7740 }, { "epoch": 4.025467775467775, "grad_norm": 0.8413663506507874, "learning_rate": 4.8454861890889206e-05, "loss": 0.1694, "num_input_tokens_seen": 1476592, "step": 7745 }, { "epoch": 4.028066528066528, "grad_norm": 0.4007912576198578, "learning_rate": 4.845093488530894e-05, "loss": 0.1427, "num_input_tokens_seen": 1477552, "step": 7750 }, { "epoch": 4.03066528066528, "grad_norm": 0.23100174963474274, "learning_rate": 4.8447003055312526e-05, "loss": 0.1773, "num_input_tokens_seen": 1478512, "step": 7755 }, { "epoch": 4.033264033264033, "grad_norm": 0.9106606245040894, "learning_rate": 4.844306640170885e-05, "loss": 0.1264, "num_input_tokens_seen": 1479440, "step": 7760 }, { "epoch": 4.035862785862786, "grad_norm": 1.0842548608779907, "learning_rate": 4.843912492530776e-05, "loss": 0.2943, "num_input_tokens_seen": 1480368, "step": 7765 }, { "epoch": 4.038461538461538, "grad_norm": 0.32262030243873596, "learning_rate": 4.8435178626920126e-05, "loss": 0.2254, "num_input_tokens_seen": 1481392, "step": 7770 }, { "epoch": 4.041060291060291, "grad_norm": 0.5926834940910339, "learning_rate": 4.843122750735778e-05, "loss": 0.2187, "num_input_tokens_seen": 1482352, "step": 7775 }, { "epoch": 4.043659043659043, "grad_norm": 0.9387165904045105, "learning_rate": 4.842727156743358e-05, "loss": 0.1628, "num_input_tokens_seen": 1483344, "step": 7780 }, { "epoch": 4.046257796257796, "grad_norm": 0.33246949315071106, "learning_rate": 4.842331080796134e-05, "loss": 0.1986, "num_input_tokens_seen": 1484240, "step": 7785 }, { "epoch": 4.048856548856548, "grad_norm": 0.4833718240261078, "learning_rate": 4.84193452297559e-05, "loss": 0.137, "num_input_tokens_seen": 1485200, "step": 7790 }, { "epoch": 4.051455301455301, "grad_norm": 0.1650390774011612, "learning_rate": 4.8415374833633055e-05, "loss": 0.1274, "num_input_tokens_seen": 1486128, "step": 7795 }, { "epoch": 4.054054054054054, "grad_norm": 0.5156524777412415, "learning_rate": 4.841139962040962e-05, "loss": 0.2003, "num_input_tokens_seen": 1487120, "step": 7800 }, { "epoch": 4.0566528066528065, "grad_norm": 0.5868646502494812, "learning_rate": 4.840741959090339e-05, "loss": 0.2071, "num_input_tokens_seen": 1488080, "step": 7805 }, { "epoch": 4.0592515592515594, "grad_norm": 0.8411211967468262, "learning_rate": 4.840343474593315e-05, "loss": 0.1531, "num_input_tokens_seen": 1489104, "step": 7810 }, { "epoch": 4.0618503118503115, "grad_norm": 1.1120994091033936, "learning_rate": 4.839944508631869e-05, "loss": 0.1341, "num_input_tokens_seen": 1490096, "step": 7815 }, { "epoch": 4.0644490644490645, "grad_norm": 0.10873778909444809, "learning_rate": 4.8395450612880756e-05, "loss": 0.1075, "num_input_tokens_seen": 1491024, "step": 7820 }, { "epoch": 4.0670478170478175, "grad_norm": 0.160070538520813, "learning_rate": 4.839145132644113e-05, "loss": 0.1488, "num_input_tokens_seen": 1491984, "step": 7825 }, { "epoch": 4.06964656964657, "grad_norm": 0.648093581199646, "learning_rate": 4.8387447227822534e-05, "loss": 0.1007, "num_input_tokens_seen": 1493040, "step": 7830 }, { "epoch": 4.0722453222453225, "grad_norm": 0.23394738137722015, "learning_rate": 4.8383438317848726e-05, "loss": 0.1852, "num_input_tokens_seen": 1494000, "step": 7835 }, { "epoch": 4.074844074844075, "grad_norm": 0.6064674258232117, "learning_rate": 4.837942459734444e-05, "loss": 0.1973, "num_input_tokens_seen": 1494896, "step": 7840 }, { "epoch": 4.077442827442828, "grad_norm": 1.1266251802444458, "learning_rate": 4.837540606713538e-05, "loss": 0.1811, "num_input_tokens_seen": 1495920, "step": 7845 }, { "epoch": 4.08004158004158, "grad_norm": 0.6757094264030457, "learning_rate": 4.8371382728048244e-05, "loss": 0.154, "num_input_tokens_seen": 1496912, "step": 7850 }, { "epoch": 4.082640332640333, "grad_norm": 0.5245633125305176, "learning_rate": 4.836735458091075e-05, "loss": 0.1091, "num_input_tokens_seen": 1497872, "step": 7855 }, { "epoch": 4.085239085239086, "grad_norm": 1.0097341537475586, "learning_rate": 4.8363321626551575e-05, "loss": 0.173, "num_input_tokens_seen": 1498832, "step": 7860 }, { "epoch": 4.087837837837838, "grad_norm": 0.24066241085529327, "learning_rate": 4.8359283865800395e-05, "loss": 0.1234, "num_input_tokens_seen": 1499792, "step": 7865 }, { "epoch": 4.090436590436591, "grad_norm": 0.4299541711807251, "learning_rate": 4.835524129948787e-05, "loss": 0.2602, "num_input_tokens_seen": 1500784, "step": 7870 }, { "epoch": 4.093035343035343, "grad_norm": 0.3106630742549896, "learning_rate": 4.835119392844566e-05, "loss": 0.1681, "num_input_tokens_seen": 1501712, "step": 7875 }, { "epoch": 4.095634095634096, "grad_norm": 0.14845380187034607, "learning_rate": 4.83471417535064e-05, "loss": 0.1151, "num_input_tokens_seen": 1502704, "step": 7880 }, { "epoch": 4.098232848232848, "grad_norm": 1.7054848670959473, "learning_rate": 4.834308477550371e-05, "loss": 0.2753, "num_input_tokens_seen": 1503664, "step": 7885 }, { "epoch": 4.100831600831601, "grad_norm": 0.2744426429271698, "learning_rate": 4.833902299527222e-05, "loss": 0.1688, "num_input_tokens_seen": 1504592, "step": 7890 }, { "epoch": 4.103430353430354, "grad_norm": 0.5971371531486511, "learning_rate": 4.833495641364754e-05, "loss": 0.2128, "num_input_tokens_seen": 1505584, "step": 7895 }, { "epoch": 4.106029106029106, "grad_norm": 1.303605556488037, "learning_rate": 4.833088503146626e-05, "loss": 0.1947, "num_input_tokens_seen": 1506544, "step": 7900 }, { "epoch": 4.108627858627859, "grad_norm": 0.564113438129425, "learning_rate": 4.8326808849565936e-05, "loss": 0.1561, "num_input_tokens_seen": 1507376, "step": 7905 }, { "epoch": 4.111226611226611, "grad_norm": 0.37422311305999756, "learning_rate": 4.8322727868785165e-05, "loss": 0.1519, "num_input_tokens_seen": 1508368, "step": 7910 }, { "epoch": 4.113825363825364, "grad_norm": 0.3458324670791626, "learning_rate": 4.831864208996348e-05, "loss": 0.1407, "num_input_tokens_seen": 1509328, "step": 7915 }, { "epoch": 4.116424116424117, "grad_norm": 0.9334607124328613, "learning_rate": 4.8314551513941455e-05, "loss": 0.2191, "num_input_tokens_seen": 1510352, "step": 7920 }, { "epoch": 4.119022869022869, "grad_norm": 0.38763725757598877, "learning_rate": 4.831045614156058e-05, "loss": 0.0622, "num_input_tokens_seen": 1511280, "step": 7925 }, { "epoch": 4.121621621621622, "grad_norm": 0.8375234603881836, "learning_rate": 4.8306355973663396e-05, "loss": 0.1522, "num_input_tokens_seen": 1512176, "step": 7930 }, { "epoch": 4.124220374220374, "grad_norm": 0.5051099061965942, "learning_rate": 4.83022510110934e-05, "loss": 0.1323, "num_input_tokens_seen": 1513104, "step": 7935 }, { "epoch": 4.126819126819127, "grad_norm": 0.5362734198570251, "learning_rate": 4.8298141254695085e-05, "loss": 0.1354, "num_input_tokens_seen": 1514064, "step": 7940 }, { "epoch": 4.129417879417879, "grad_norm": 0.5445085763931274, "learning_rate": 4.829402670531391e-05, "loss": 0.1902, "num_input_tokens_seen": 1515056, "step": 7945 }, { "epoch": 4.132016632016632, "grad_norm": 0.30289292335510254, "learning_rate": 4.828990736379635e-05, "loss": 0.0746, "num_input_tokens_seen": 1516016, "step": 7950 }, { "epoch": 4.134615384615385, "grad_norm": 0.3790205121040344, "learning_rate": 4.828578323098985e-05, "loss": 0.0941, "num_input_tokens_seen": 1516976, "step": 7955 }, { "epoch": 4.137214137214137, "grad_norm": 0.6713950037956238, "learning_rate": 4.828165430774283e-05, "loss": 0.2661, "num_input_tokens_seen": 1517968, "step": 7960 }, { "epoch": 4.13981288981289, "grad_norm": 0.56611168384552, "learning_rate": 4.827752059490472e-05, "loss": 0.2026, "num_input_tokens_seen": 1518928, "step": 7965 }, { "epoch": 4.142411642411642, "grad_norm": 0.21140776574611664, "learning_rate": 4.827338209332593e-05, "loss": 0.0818, "num_input_tokens_seen": 1519888, "step": 7970 }, { "epoch": 4.145010395010395, "grad_norm": 0.6908942461013794, "learning_rate": 4.8269238803857834e-05, "loss": 0.2053, "num_input_tokens_seen": 1520880, "step": 7975 }, { "epoch": 4.147609147609147, "grad_norm": 0.420526385307312, "learning_rate": 4.8265090727352815e-05, "loss": 0.2175, "num_input_tokens_seen": 1521840, "step": 7980 }, { "epoch": 4.1502079002079, "grad_norm": 0.23095621168613434, "learning_rate": 4.826093786466422e-05, "loss": 0.0823, "num_input_tokens_seen": 1522640, "step": 7985 }, { "epoch": 4.152806652806653, "grad_norm": 0.34647563099861145, "learning_rate": 4.8256780216646403e-05, "loss": 0.1272, "num_input_tokens_seen": 1523504, "step": 7990 }, { "epoch": 4.155405405405405, "grad_norm": 0.20017534494400024, "learning_rate": 4.825261778415469e-05, "loss": 0.1472, "num_input_tokens_seen": 1524464, "step": 7995 }, { "epoch": 4.158004158004158, "grad_norm": 0.6403810977935791, "learning_rate": 4.824845056804538e-05, "loss": 0.1936, "num_input_tokens_seen": 1525488, "step": 8000 }, { "epoch": 4.16060291060291, "grad_norm": 0.966761589050293, "learning_rate": 4.824427856917579e-05, "loss": 0.1367, "num_input_tokens_seen": 1526384, "step": 8005 }, { "epoch": 4.163201663201663, "grad_norm": 0.2555440664291382, "learning_rate": 4.824010178840418e-05, "loss": 0.1348, "num_input_tokens_seen": 1527376, "step": 8010 }, { "epoch": 4.165800415800415, "grad_norm": 0.6623964309692383, "learning_rate": 4.823592022658982e-05, "loss": 0.1887, "num_input_tokens_seen": 1528368, "step": 8015 }, { "epoch": 4.168399168399168, "grad_norm": 0.3642486035823822, "learning_rate": 4.823173388459297e-05, "loss": 0.11, "num_input_tokens_seen": 1529360, "step": 8020 }, { "epoch": 4.170997920997921, "grad_norm": 0.32142841815948486, "learning_rate": 4.822754276327483e-05, "loss": 0.1352, "num_input_tokens_seen": 1530352, "step": 8025 }, { "epoch": 4.173596673596673, "grad_norm": 0.6734090447425842, "learning_rate": 4.8223346863497644e-05, "loss": 0.1019, "num_input_tokens_seen": 1531280, "step": 8030 }, { "epoch": 4.176195426195426, "grad_norm": 0.39237552881240845, "learning_rate": 4.82191461861246e-05, "loss": 0.1468, "num_input_tokens_seen": 1532272, "step": 8035 }, { "epoch": 4.1787941787941785, "grad_norm": 0.46001988649368286, "learning_rate": 4.821494073201987e-05, "loss": 0.2158, "num_input_tokens_seen": 1533232, "step": 8040 }, { "epoch": 4.1813929313929314, "grad_norm": 0.6022780537605286, "learning_rate": 4.821073050204861e-05, "loss": 0.1384, "num_input_tokens_seen": 1534160, "step": 8045 }, { "epoch": 4.183991683991684, "grad_norm": 0.5523654818534851, "learning_rate": 4.820651549707698e-05, "loss": 0.1547, "num_input_tokens_seen": 1535088, "step": 8050 }, { "epoch": 4.1865904365904365, "grad_norm": 0.5460628867149353, "learning_rate": 4.8202295717972106e-05, "loss": 0.1548, "num_input_tokens_seen": 1536048, "step": 8055 }, { "epoch": 4.1891891891891895, "grad_norm": 0.33440810441970825, "learning_rate": 4.819807116560209e-05, "loss": 0.1298, "num_input_tokens_seen": 1537008, "step": 8060 }, { "epoch": 4.191787941787942, "grad_norm": 0.3804326355457306, "learning_rate": 4.819384184083603e-05, "loss": 0.2737, "num_input_tokens_seen": 1537936, "step": 8065 }, { "epoch": 4.1943866943866945, "grad_norm": 1.1721735000610352, "learning_rate": 4.818960774454399e-05, "loss": 0.1957, "num_input_tokens_seen": 1538896, "step": 8070 }, { "epoch": 4.196985446985447, "grad_norm": 0.2646080553531647, "learning_rate": 4.8185368877597034e-05, "loss": 0.1153, "num_input_tokens_seen": 1539856, "step": 8075 }, { "epoch": 4.1995841995842, "grad_norm": 0.6109418272972107, "learning_rate": 4.818112524086719e-05, "loss": 0.0687, "num_input_tokens_seen": 1540816, "step": 8080 }, { "epoch": 4.202182952182953, "grad_norm": 0.12520982325077057, "learning_rate": 4.817687683522748e-05, "loss": 0.1392, "num_input_tokens_seen": 1541744, "step": 8085 }, { "epoch": 4.204781704781705, "grad_norm": 0.689740002155304, "learning_rate": 4.81726236615519e-05, "loss": 0.0788, "num_input_tokens_seen": 1542736, "step": 8090 }, { "epoch": 4.207380457380458, "grad_norm": 0.08759275078773499, "learning_rate": 4.816836572071543e-05, "loss": 0.1164, "num_input_tokens_seen": 1543664, "step": 8095 }, { "epoch": 4.20997920997921, "grad_norm": 0.0617809034883976, "learning_rate": 4.816410301359403e-05, "loss": 0.1192, "num_input_tokens_seen": 1544656, "step": 8100 }, { "epoch": 4.212577962577963, "grad_norm": 0.30212321877479553, "learning_rate": 4.815983554106464e-05, "loss": 0.1064, "num_input_tokens_seen": 1545552, "step": 8105 }, { "epoch": 4.215176715176715, "grad_norm": 0.6146848201751709, "learning_rate": 4.815556330400518e-05, "loss": 0.1379, "num_input_tokens_seen": 1546480, "step": 8110 }, { "epoch": 4.217775467775468, "grad_norm": 0.694129467010498, "learning_rate": 4.815128630329455e-05, "loss": 0.0942, "num_input_tokens_seen": 1547408, "step": 8115 }, { "epoch": 4.220374220374221, "grad_norm": 1.2995426654815674, "learning_rate": 4.8147004539812636e-05, "loss": 0.1231, "num_input_tokens_seen": 1548272, "step": 8120 }, { "epoch": 4.222972972972973, "grad_norm": 0.7313387989997864, "learning_rate": 4.814271801444029e-05, "loss": 0.1069, "num_input_tokens_seen": 1549200, "step": 8125 }, { "epoch": 4.225571725571726, "grad_norm": 0.6814384460449219, "learning_rate": 4.8138426728059375e-05, "loss": 0.1, "num_input_tokens_seen": 1550096, "step": 8130 }, { "epoch": 4.228170478170478, "grad_norm": 0.6097374558448792, "learning_rate": 4.813413068155267e-05, "loss": 0.0925, "num_input_tokens_seen": 1550992, "step": 8135 }, { "epoch": 4.230769230769231, "grad_norm": 1.2332196235656738, "learning_rate": 4.8129829875804014e-05, "loss": 0.1281, "num_input_tokens_seen": 1551952, "step": 8140 }, { "epoch": 4.233367983367984, "grad_norm": 0.989331066608429, "learning_rate": 4.812552431169817e-05, "loss": 0.1388, "num_input_tokens_seen": 1552976, "step": 8145 }, { "epoch": 4.235966735966736, "grad_norm": 0.3081319332122803, "learning_rate": 4.8121213990120886e-05, "loss": 0.1404, "num_input_tokens_seen": 1553872, "step": 8150 }, { "epoch": 4.238565488565489, "grad_norm": 0.11774322390556335, "learning_rate": 4.811689891195891e-05, "loss": 0.1078, "num_input_tokens_seen": 1554800, "step": 8155 }, { "epoch": 4.241164241164241, "grad_norm": 0.7520690560340881, "learning_rate": 4.8112579078099955e-05, "loss": 0.2182, "num_input_tokens_seen": 1555760, "step": 8160 }, { "epoch": 4.243762993762994, "grad_norm": 0.6901597380638123, "learning_rate": 4.8108254489432715e-05, "loss": 0.0868, "num_input_tokens_seen": 1556720, "step": 8165 }, { "epoch": 4.246361746361746, "grad_norm": 1.2187656164169312, "learning_rate": 4.810392514684685e-05, "loss": 0.1864, "num_input_tokens_seen": 1557776, "step": 8170 }, { "epoch": 4.248960498960499, "grad_norm": 1.05902898311615, "learning_rate": 4.8099591051233016e-05, "loss": 0.3343, "num_input_tokens_seen": 1558736, "step": 8175 }, { "epoch": 4.251559251559252, "grad_norm": 0.8275876641273499, "learning_rate": 4.809525220348284e-05, "loss": 0.095, "num_input_tokens_seen": 1559664, "step": 8180 }, { "epoch": 4.254158004158004, "grad_norm": 0.6300448179244995, "learning_rate": 4.809090860448894e-05, "loss": 0.1892, "num_input_tokens_seen": 1560624, "step": 8185 }, { "epoch": 4.256756756756757, "grad_norm": 0.8438617587089539, "learning_rate": 4.808656025514487e-05, "loss": 0.1039, "num_input_tokens_seen": 1561616, "step": 8190 }, { "epoch": 4.259355509355509, "grad_norm": 0.49955007433891296, "learning_rate": 4.8082207156345204e-05, "loss": 0.2085, "num_input_tokens_seen": 1562640, "step": 8195 }, { "epoch": 4.261954261954262, "grad_norm": 0.25704142451286316, "learning_rate": 4.807784930898549e-05, "loss": 0.218, "num_input_tokens_seen": 1563600, "step": 8200 }, { "epoch": 4.264553014553014, "grad_norm": 0.5616783499717712, "learning_rate": 4.807348671396221e-05, "loss": 0.1218, "num_input_tokens_seen": 1564560, "step": 8205 }, { "epoch": 4.267151767151767, "grad_norm": 0.4823022186756134, "learning_rate": 4.806911937217289e-05, "loss": 0.1383, "num_input_tokens_seen": 1565552, "step": 8210 }, { "epoch": 4.26975051975052, "grad_norm": 0.7460245490074158, "learning_rate": 4.806474728451597e-05, "loss": 0.198, "num_input_tokens_seen": 1566512, "step": 8215 }, { "epoch": 4.272349272349272, "grad_norm": 0.7781205773353577, "learning_rate": 4.80603704518909e-05, "loss": 0.1786, "num_input_tokens_seen": 1567344, "step": 8220 }, { "epoch": 4.274948024948025, "grad_norm": 0.2668364346027374, "learning_rate": 4.80559888751981e-05, "loss": 0.1716, "num_input_tokens_seen": 1568336, "step": 8225 }, { "epoch": 4.277546777546777, "grad_norm": 1.0387356281280518, "learning_rate": 4.805160255533897e-05, "loss": 0.1498, "num_input_tokens_seen": 1569264, "step": 8230 }, { "epoch": 4.28014553014553, "grad_norm": 1.2487242221832275, "learning_rate": 4.8047211493215874e-05, "loss": 0.1743, "num_input_tokens_seen": 1570320, "step": 8235 }, { "epoch": 4.282744282744282, "grad_norm": 0.42895281314849854, "learning_rate": 4.8042815689732154e-05, "loss": 0.0766, "num_input_tokens_seen": 1571280, "step": 8240 }, { "epoch": 4.285343035343035, "grad_norm": 0.5550501942634583, "learning_rate": 4.803841514579215e-05, "loss": 0.1215, "num_input_tokens_seen": 1572176, "step": 8245 }, { "epoch": 4.287941787941788, "grad_norm": 0.4010031819343567, "learning_rate": 4.8034009862301135e-05, "loss": 0.0939, "num_input_tokens_seen": 1573232, "step": 8250 }, { "epoch": 4.29054054054054, "grad_norm": 0.592365026473999, "learning_rate": 4.8029599840165396e-05, "loss": 0.1926, "num_input_tokens_seen": 1574096, "step": 8255 }, { "epoch": 4.293139293139293, "grad_norm": 1.3449885845184326, "learning_rate": 4.8025185080292165e-05, "loss": 0.1447, "num_input_tokens_seen": 1575056, "step": 8260 }, { "epoch": 4.295738045738045, "grad_norm": 1.117030143737793, "learning_rate": 4.8020765583589685e-05, "loss": 0.214, "num_input_tokens_seen": 1576048, "step": 8265 }, { "epoch": 4.298336798336798, "grad_norm": 1.3157947063446045, "learning_rate": 4.801634135096714e-05, "loss": 0.2092, "num_input_tokens_seen": 1577008, "step": 8270 }, { "epoch": 4.3009355509355505, "grad_norm": 1.132685899734497, "learning_rate": 4.80119123833347e-05, "loss": 0.1678, "num_input_tokens_seen": 1577936, "step": 8275 }, { "epoch": 4.303534303534303, "grad_norm": 0.40384289622306824, "learning_rate": 4.80074786816035e-05, "loss": 0.1236, "num_input_tokens_seen": 1578864, "step": 8280 }, { "epoch": 4.306133056133056, "grad_norm": 0.6384860277175903, "learning_rate": 4.800304024668568e-05, "loss": 0.1441, "num_input_tokens_seen": 1579920, "step": 8285 }, { "epoch": 4.3087318087318085, "grad_norm": 0.12292313575744629, "learning_rate": 4.799859707949431e-05, "loss": 0.1, "num_input_tokens_seen": 1580848, "step": 8290 }, { "epoch": 4.3113305613305615, "grad_norm": 0.21922524273395538, "learning_rate": 4.799414918094347e-05, "loss": 0.1046, "num_input_tokens_seen": 1581872, "step": 8295 }, { "epoch": 4.313929313929314, "grad_norm": 1.3097202777862549, "learning_rate": 4.798969655194819e-05, "loss": 0.156, "num_input_tokens_seen": 1582800, "step": 8300 }, { "epoch": 4.3165280665280665, "grad_norm": 0.8265552520751953, "learning_rate": 4.798523919342449e-05, "loss": 0.1918, "num_input_tokens_seen": 1583760, "step": 8305 }, { "epoch": 4.3191268191268195, "grad_norm": 0.11535534262657166, "learning_rate": 4.798077710628935e-05, "loss": 0.2324, "num_input_tokens_seen": 1584720, "step": 8310 }, { "epoch": 4.321725571725572, "grad_norm": 0.7449083924293518, "learning_rate": 4.797631029146072e-05, "loss": 0.2627, "num_input_tokens_seen": 1585680, "step": 8315 }, { "epoch": 4.324324324324325, "grad_norm": 0.14947642385959625, "learning_rate": 4.797183874985755e-05, "loss": 0.1037, "num_input_tokens_seen": 1586640, "step": 8320 }, { "epoch": 4.326923076923077, "grad_norm": 1.2141748666763306, "learning_rate": 4.7967362482399724e-05, "loss": 0.1076, "num_input_tokens_seen": 1587504, "step": 8325 }, { "epoch": 4.32952182952183, "grad_norm": 0.664212703704834, "learning_rate": 4.796288149000813e-05, "loss": 0.1233, "num_input_tokens_seen": 1588432, "step": 8330 }, { "epoch": 4.332120582120582, "grad_norm": 2.6286532878875732, "learning_rate": 4.79583957736046e-05, "loss": 0.3126, "num_input_tokens_seen": 1589424, "step": 8335 }, { "epoch": 4.334719334719335, "grad_norm": 0.6333224773406982, "learning_rate": 4.795390533411197e-05, "loss": 0.0638, "num_input_tokens_seen": 1590352, "step": 8340 }, { "epoch": 4.337318087318088, "grad_norm": 0.6099389791488647, "learning_rate": 4.794941017245401e-05, "loss": 0.0712, "num_input_tokens_seen": 1591280, "step": 8345 }, { "epoch": 4.33991683991684, "grad_norm": 0.9492485523223877, "learning_rate": 4.7944910289555506e-05, "loss": 0.0683, "num_input_tokens_seen": 1592304, "step": 8350 }, { "epoch": 4.342515592515593, "grad_norm": 0.6610191464424133, "learning_rate": 4.794040568634217e-05, "loss": 0.1741, "num_input_tokens_seen": 1593296, "step": 8355 }, { "epoch": 4.345114345114345, "grad_norm": 1.8421063423156738, "learning_rate": 4.793589636374072e-05, "loss": 0.1806, "num_input_tokens_seen": 1594128, "step": 8360 }, { "epoch": 4.347713097713098, "grad_norm": 0.17574843764305115, "learning_rate": 4.793138232267883e-05, "loss": 0.0999, "num_input_tokens_seen": 1595088, "step": 8365 }, { "epoch": 4.350311850311851, "grad_norm": 1.2601463794708252, "learning_rate": 4.792686356408513e-05, "loss": 0.2017, "num_input_tokens_seen": 1596016, "step": 8370 }, { "epoch": 4.352910602910603, "grad_norm": 1.437887191772461, "learning_rate": 4.792234008888926e-05, "loss": 0.2505, "num_input_tokens_seen": 1596944, "step": 8375 }, { "epoch": 4.355509355509356, "grad_norm": 0.7803478240966797, "learning_rate": 4.791781189802179e-05, "loss": 0.116, "num_input_tokens_seen": 1597968, "step": 8380 }, { "epoch": 4.358108108108108, "grad_norm": 0.9091765284538269, "learning_rate": 4.791327899241428e-05, "loss": 0.1435, "num_input_tokens_seen": 1598864, "step": 8385 }, { "epoch": 4.360706860706861, "grad_norm": 0.09373755007982254, "learning_rate": 4.790874137299927e-05, "loss": 0.128, "num_input_tokens_seen": 1599760, "step": 8390 }, { "epoch": 4.363305613305613, "grad_norm": 0.6442629098892212, "learning_rate": 4.790419904071023e-05, "loss": 0.1039, "num_input_tokens_seen": 1600656, "step": 8395 }, { "epoch": 4.365904365904366, "grad_norm": 0.6649347543716431, "learning_rate": 4.7899651996481656e-05, "loss": 0.1546, "num_input_tokens_seen": 1601616, "step": 8400 }, { "epoch": 4.368503118503119, "grad_norm": 0.8873545527458191, "learning_rate": 4.789510024124895e-05, "loss": 0.1749, "num_input_tokens_seen": 1602544, "step": 8405 }, { "epoch": 4.371101871101871, "grad_norm": 0.907437264919281, "learning_rate": 4.789054377594855e-05, "loss": 0.1212, "num_input_tokens_seen": 1603504, "step": 8410 }, { "epoch": 4.373700623700624, "grad_norm": 0.5437235832214355, "learning_rate": 4.788598260151781e-05, "loss": 0.2797, "num_input_tokens_seen": 1604464, "step": 8415 }, { "epoch": 4.376299376299376, "grad_norm": 0.6769498586654663, "learning_rate": 4.788141671889507e-05, "loss": 0.1082, "num_input_tokens_seen": 1605392, "step": 8420 }, { "epoch": 4.378898128898129, "grad_norm": 0.6181042194366455, "learning_rate": 4.787684612901965e-05, "loss": 0.0959, "num_input_tokens_seen": 1606352, "step": 8425 }, { "epoch": 4.381496881496881, "grad_norm": 0.19323451817035675, "learning_rate": 4.7872270832831834e-05, "loss": 0.0724, "num_input_tokens_seen": 1607344, "step": 8430 }, { "epoch": 4.384095634095634, "grad_norm": 1.5335944890975952, "learning_rate": 4.786769083127286e-05, "loss": 0.2071, "num_input_tokens_seen": 1608336, "step": 8435 }, { "epoch": 4.386694386694387, "grad_norm": 0.3128979504108429, "learning_rate": 4.786310612528494e-05, "loss": 0.0909, "num_input_tokens_seen": 1609328, "step": 8440 }, { "epoch": 4.389293139293139, "grad_norm": 0.6261016130447388, "learning_rate": 4.785851671581127e-05, "loss": 0.1698, "num_input_tokens_seen": 1610288, "step": 8445 }, { "epoch": 4.391891891891892, "grad_norm": 1.4061808586120605, "learning_rate": 4.7853922603795984e-05, "loss": 0.1373, "num_input_tokens_seen": 1611248, "step": 8450 }, { "epoch": 4.394490644490644, "grad_norm": 1.081445336341858, "learning_rate": 4.784932379018422e-05, "loss": 0.202, "num_input_tokens_seen": 1612208, "step": 8455 }, { "epoch": 4.397089397089397, "grad_norm": 0.9224331974983215, "learning_rate": 4.784472027592205e-05, "loss": 0.1299, "num_input_tokens_seen": 1613136, "step": 8460 }, { "epoch": 4.399688149688149, "grad_norm": 0.43582913279533386, "learning_rate": 4.7840112061956526e-05, "loss": 0.1273, "num_input_tokens_seen": 1614160, "step": 8465 }, { "epoch": 4.402286902286902, "grad_norm": 0.4555685222148895, "learning_rate": 4.7835499149235676e-05, "loss": 0.2251, "num_input_tokens_seen": 1615120, "step": 8470 }, { "epoch": 4.404885654885655, "grad_norm": 0.47449424862861633, "learning_rate": 4.783088153870848e-05, "loss": 0.2016, "num_input_tokens_seen": 1616144, "step": 8475 }, { "epoch": 4.407484407484407, "grad_norm": 0.8506744503974915, "learning_rate": 4.78262592313249e-05, "loss": 0.2442, "num_input_tokens_seen": 1617136, "step": 8480 }, { "epoch": 4.41008316008316, "grad_norm": 0.4427083134651184, "learning_rate": 4.782163222803584e-05, "loss": 0.1969, "num_input_tokens_seen": 1618064, "step": 8485 }, { "epoch": 4.412681912681912, "grad_norm": 0.7567692399024963, "learning_rate": 4.781700052979319e-05, "loss": 0.1849, "num_input_tokens_seen": 1619024, "step": 8490 }, { "epoch": 4.415280665280665, "grad_norm": 0.5793880820274353, "learning_rate": 4.78123641375498e-05, "loss": 0.1113, "num_input_tokens_seen": 1620112, "step": 8495 }, { "epoch": 4.417879417879418, "grad_norm": 1.6072183847427368, "learning_rate": 4.78077230522595e-05, "loss": 0.2273, "num_input_tokens_seen": 1621136, "step": 8500 }, { "epoch": 4.42047817047817, "grad_norm": 0.2795169949531555, "learning_rate": 4.780307727487705e-05, "loss": 0.0658, "num_input_tokens_seen": 1622064, "step": 8505 }, { "epoch": 4.423076923076923, "grad_norm": 0.6740986704826355, "learning_rate": 4.779842680635821e-05, "loss": 0.1157, "num_input_tokens_seen": 1622992, "step": 8510 }, { "epoch": 4.425675675675675, "grad_norm": 1.1097112894058228, "learning_rate": 4.7793771647659705e-05, "loss": 0.1215, "num_input_tokens_seen": 1623920, "step": 8515 }, { "epoch": 4.428274428274428, "grad_norm": 0.4528648257255554, "learning_rate": 4.778911179973918e-05, "loss": 0.1805, "num_input_tokens_seen": 1624880, "step": 8520 }, { "epoch": 4.4308731808731805, "grad_norm": 0.5380620956420898, "learning_rate": 4.7784447263555296e-05, "loss": 0.1144, "num_input_tokens_seen": 1625840, "step": 8525 }, { "epoch": 4.4334719334719335, "grad_norm": 0.18595990538597107, "learning_rate": 4.777977804006766e-05, "loss": 0.1159, "num_input_tokens_seen": 1626768, "step": 8530 }, { "epoch": 4.436070686070686, "grad_norm": 1.3779666423797607, "learning_rate": 4.777510413023684e-05, "loss": 0.2362, "num_input_tokens_seen": 1627696, "step": 8535 }, { "epoch": 4.4386694386694385, "grad_norm": 0.4602910280227661, "learning_rate": 4.777042553502437e-05, "loss": 0.233, "num_input_tokens_seen": 1628624, "step": 8540 }, { "epoch": 4.4412681912681915, "grad_norm": 1.2908869981765747, "learning_rate": 4.776574225539276e-05, "loss": 0.1058, "num_input_tokens_seen": 1629520, "step": 8545 }, { "epoch": 4.443866943866944, "grad_norm": 0.4581542909145355, "learning_rate": 4.776105429230544e-05, "loss": 0.1447, "num_input_tokens_seen": 1630448, "step": 8550 }, { "epoch": 4.446465696465697, "grad_norm": 1.1698346138000488, "learning_rate": 4.775636164672687e-05, "loss": 0.1463, "num_input_tokens_seen": 1631408, "step": 8555 }, { "epoch": 4.4490644490644495, "grad_norm": 0.18383647501468658, "learning_rate": 4.775166431962242e-05, "loss": 0.0359, "num_input_tokens_seen": 1632400, "step": 8560 }, { "epoch": 4.451663201663202, "grad_norm": 0.7866418957710266, "learning_rate": 4.774696231195845e-05, "loss": 0.1557, "num_input_tokens_seen": 1633264, "step": 8565 }, { "epoch": 4.454261954261955, "grad_norm": 0.7798214554786682, "learning_rate": 4.774225562470228e-05, "loss": 0.1074, "num_input_tokens_seen": 1634256, "step": 8570 }, { "epoch": 4.456860706860707, "grad_norm": 0.8157163858413696, "learning_rate": 4.773754425882217e-05, "loss": 0.1857, "num_input_tokens_seen": 1635184, "step": 8575 }, { "epoch": 4.45945945945946, "grad_norm": 0.45219334959983826, "learning_rate": 4.773282821528737e-05, "loss": 0.0938, "num_input_tokens_seen": 1636112, "step": 8580 }, { "epoch": 4.462058212058212, "grad_norm": 0.2830561697483063, "learning_rate": 4.77281074950681e-05, "loss": 0.101, "num_input_tokens_seen": 1637104, "step": 8585 }, { "epoch": 4.464656964656965, "grad_norm": 0.7981629371643066, "learning_rate": 4.772338209913549e-05, "loss": 0.1438, "num_input_tokens_seen": 1638032, "step": 8590 }, { "epoch": 4.467255717255718, "grad_norm": 0.6767458319664001, "learning_rate": 4.77186520284617e-05, "loss": 0.2291, "num_input_tokens_seen": 1638960, "step": 8595 }, { "epoch": 4.46985446985447, "grad_norm": 0.6022239327430725, "learning_rate": 4.771391728401979e-05, "loss": 0.0938, "num_input_tokens_seen": 1639952, "step": 8600 }, { "epoch": 4.472453222453223, "grad_norm": 0.47920846939086914, "learning_rate": 4.770917786678383e-05, "loss": 0.0875, "num_input_tokens_seen": 1640880, "step": 8605 }, { "epoch": 4.475051975051975, "grad_norm": 0.6824890375137329, "learning_rate": 4.770443377772882e-05, "loss": 0.2202, "num_input_tokens_seen": 1641808, "step": 8610 }, { "epoch": 4.477650727650728, "grad_norm": 0.7440101504325867, "learning_rate": 4.7699685017830734e-05, "loss": 0.1243, "num_input_tokens_seen": 1642832, "step": 8615 }, { "epoch": 4.48024948024948, "grad_norm": 0.1713937371969223, "learning_rate": 4.769493158806651e-05, "loss": 0.2019, "num_input_tokens_seen": 1643824, "step": 8620 }, { "epoch": 4.482848232848233, "grad_norm": 0.6000452637672424, "learning_rate": 4.769017348941404e-05, "loss": 0.1505, "num_input_tokens_seen": 1644720, "step": 8625 }, { "epoch": 4.485446985446986, "grad_norm": 1.3577752113342285, "learning_rate": 4.7685410722852174e-05, "loss": 0.2057, "num_input_tokens_seen": 1645680, "step": 8630 }, { "epoch": 4.488045738045738, "grad_norm": 0.24875248968601227, "learning_rate": 4.7680643289360736e-05, "loss": 0.0822, "num_input_tokens_seen": 1646736, "step": 8635 }, { "epoch": 4.490644490644491, "grad_norm": 1.097732424736023, "learning_rate": 4.767587118992048e-05, "loss": 0.158, "num_input_tokens_seen": 1647696, "step": 8640 }, { "epoch": 4.493243243243243, "grad_norm": 1.139163613319397, "learning_rate": 4.7671094425513176e-05, "loss": 0.1196, "num_input_tokens_seen": 1648688, "step": 8645 }, { "epoch": 4.495841995841996, "grad_norm": 1.4794440269470215, "learning_rate": 4.766631299712148e-05, "loss": 0.1214, "num_input_tokens_seen": 1649680, "step": 8650 }, { "epoch": 4.498440748440748, "grad_norm": 0.09050153195858002, "learning_rate": 4.766152690572907e-05, "loss": 0.0947, "num_input_tokens_seen": 1650640, "step": 8655 }, { "epoch": 4.501039501039501, "grad_norm": 0.19736115634441376, "learning_rate": 4.765673615232055e-05, "loss": 0.2341, "num_input_tokens_seen": 1651632, "step": 8660 }, { "epoch": 4.503638253638254, "grad_norm": 0.25868406891822815, "learning_rate": 4.7651940737881496e-05, "loss": 0.0982, "num_input_tokens_seen": 1652656, "step": 8665 }, { "epoch": 4.506237006237006, "grad_norm": 0.05706464499235153, "learning_rate": 4.7647140663398436e-05, "loss": 0.2295, "num_input_tokens_seen": 1653648, "step": 8670 }, { "epoch": 4.508835758835759, "grad_norm": 1.6958240270614624, "learning_rate": 4.764233592985886e-05, "loss": 0.3369, "num_input_tokens_seen": 1654672, "step": 8675 }, { "epoch": 4.511434511434511, "grad_norm": 2.0961546897888184, "learning_rate": 4.7637526538251224e-05, "loss": 0.289, "num_input_tokens_seen": 1655632, "step": 8680 }, { "epoch": 4.514033264033264, "grad_norm": 0.29743683338165283, "learning_rate": 4.7632712489564926e-05, "loss": 0.1564, "num_input_tokens_seen": 1656656, "step": 8685 }, { "epoch": 4.516632016632016, "grad_norm": 0.8403484225273132, "learning_rate": 4.762789378479032e-05, "loss": 0.1214, "num_input_tokens_seen": 1657616, "step": 8690 }, { "epoch": 4.519230769230769, "grad_norm": 0.7732468247413635, "learning_rate": 4.7623070424918746e-05, "loss": 0.1666, "num_input_tokens_seen": 1658544, "step": 8695 }, { "epoch": 4.521829521829522, "grad_norm": 0.3749888837337494, "learning_rate": 4.761824241094248e-05, "loss": 0.0877, "num_input_tokens_seen": 1659504, "step": 8700 }, { "epoch": 4.524428274428274, "grad_norm": 0.5891456007957458, "learning_rate": 4.7613409743854765e-05, "loss": 0.1039, "num_input_tokens_seen": 1660432, "step": 8705 }, { "epoch": 4.527027027027027, "grad_norm": 0.7854650020599365, "learning_rate": 4.760857242464978e-05, "loss": 0.2629, "num_input_tokens_seen": 1661328, "step": 8710 }, { "epoch": 4.529625779625779, "grad_norm": 0.6480792164802551, "learning_rate": 4.760373045432269e-05, "loss": 0.1348, "num_input_tokens_seen": 1662256, "step": 8715 }, { "epoch": 4.532224532224532, "grad_norm": 1.5263184309005737, "learning_rate": 4.7598883833869604e-05, "loss": 0.2039, "num_input_tokens_seen": 1663280, "step": 8720 }, { "epoch": 4.534823284823284, "grad_norm": 1.0111067295074463, "learning_rate": 4.759403256428757e-05, "loss": 0.1084, "num_input_tokens_seen": 1664272, "step": 8725 }, { "epoch": 4.537422037422037, "grad_norm": 0.5339885354042053, "learning_rate": 4.758917664657463e-05, "loss": 0.119, "num_input_tokens_seen": 1665168, "step": 8730 }, { "epoch": 4.54002079002079, "grad_norm": 0.10520847886800766, "learning_rate": 4.758431608172975e-05, "loss": 0.2045, "num_input_tokens_seen": 1666128, "step": 8735 }, { "epoch": 4.542619542619542, "grad_norm": 1.3007911443710327, "learning_rate": 4.7579450870752876e-05, "loss": 0.1142, "num_input_tokens_seen": 1667088, "step": 8740 }, { "epoch": 4.545218295218295, "grad_norm": 0.4158974289894104, "learning_rate": 4.757458101464488e-05, "loss": 0.0961, "num_input_tokens_seen": 1668048, "step": 8745 }, { "epoch": 4.547817047817047, "grad_norm": 0.45577749609947205, "learning_rate": 4.7569706514407625e-05, "loss": 0.0649, "num_input_tokens_seen": 1669008, "step": 8750 }, { "epoch": 4.5504158004158, "grad_norm": 0.5637729167938232, "learning_rate": 4.7564827371043904e-05, "loss": 0.0809, "num_input_tokens_seen": 1669936, "step": 8755 }, { "epoch": 4.553014553014553, "grad_norm": 0.16740261018276215, "learning_rate": 4.755994358555747e-05, "loss": 0.1935, "num_input_tokens_seen": 1671088, "step": 8760 }, { "epoch": 4.5556133056133055, "grad_norm": 0.5505849719047546, "learning_rate": 4.755505515895304e-05, "loss": 0.1056, "num_input_tokens_seen": 1671952, "step": 8765 }, { "epoch": 4.558212058212058, "grad_norm": 1.4452227354049683, "learning_rate": 4.7550162092236284e-05, "loss": 0.1881, "num_input_tokens_seen": 1672912, "step": 8770 }, { "epoch": 4.5608108108108105, "grad_norm": 0.8117442727088928, "learning_rate": 4.75452643864138e-05, "loss": 0.0967, "num_input_tokens_seen": 1673904, "step": 8775 }, { "epoch": 4.5634095634095635, "grad_norm": 0.2544230818748474, "learning_rate": 4.754036204249319e-05, "loss": 0.0874, "num_input_tokens_seen": 1674832, "step": 8780 }, { "epoch": 4.5660083160083165, "grad_norm": 0.47352316975593567, "learning_rate": 4.7535455061482973e-05, "loss": 0.1111, "num_input_tokens_seen": 1675792, "step": 8785 }, { "epoch": 4.5686070686070686, "grad_norm": 1.2747931480407715, "learning_rate": 4.753054344439263e-05, "loss": 0.1418, "num_input_tokens_seen": 1676720, "step": 8790 }, { "epoch": 4.5712058212058215, "grad_norm": 0.6647705435752869, "learning_rate": 4.7525627192232594e-05, "loss": 0.1774, "num_input_tokens_seen": 1677744, "step": 8795 }, { "epoch": 4.573804573804574, "grad_norm": 0.20847062766551971, "learning_rate": 4.7520706306014264e-05, "loss": 0.1633, "num_input_tokens_seen": 1678800, "step": 8800 }, { "epoch": 4.576403326403327, "grad_norm": 0.3311695456504822, "learning_rate": 4.7515780786749974e-05, "loss": 0.17, "num_input_tokens_seen": 1679760, "step": 8805 }, { "epoch": 4.579002079002079, "grad_norm": 0.33152079582214355, "learning_rate": 4.751085063545303e-05, "loss": 0.2414, "num_input_tokens_seen": 1680720, "step": 8810 }, { "epoch": 4.581600831600832, "grad_norm": 0.47776734828948975, "learning_rate": 4.750591585313767e-05, "loss": 0.1308, "num_input_tokens_seen": 1681648, "step": 8815 }, { "epoch": 4.584199584199585, "grad_norm": 0.36150816082954407, "learning_rate": 4.750097644081911e-05, "loss": 0.1024, "num_input_tokens_seen": 1682608, "step": 8820 }, { "epoch": 4.586798336798337, "grad_norm": 0.5027714967727661, "learning_rate": 4.74960323995135e-05, "loss": 0.0759, "num_input_tokens_seen": 1683536, "step": 8825 }, { "epoch": 4.58939708939709, "grad_norm": 1.8558200597763062, "learning_rate": 4.749108373023794e-05, "loss": 0.1921, "num_input_tokens_seen": 1684528, "step": 8830 }, { "epoch": 4.591995841995842, "grad_norm": 0.8220311403274536, "learning_rate": 4.748613043401051e-05, "loss": 0.1065, "num_input_tokens_seen": 1685552, "step": 8835 }, { "epoch": 4.594594594594595, "grad_norm": 1.4065563678741455, "learning_rate": 4.748117251185019e-05, "loss": 0.1577, "num_input_tokens_seen": 1686416, "step": 8840 }, { "epoch": 4.597193347193347, "grad_norm": 0.5230646729469299, "learning_rate": 4.747620996477696e-05, "loss": 0.1812, "num_input_tokens_seen": 1687376, "step": 8845 }, { "epoch": 4.5997920997921, "grad_norm": 0.7920271158218384, "learning_rate": 4.747124279381174e-05, "loss": 0.1249, "num_input_tokens_seen": 1688336, "step": 8850 }, { "epoch": 4.602390852390853, "grad_norm": 0.6119995713233948, "learning_rate": 4.746627099997638e-05, "loss": 0.1999, "num_input_tokens_seen": 1689264, "step": 8855 }, { "epoch": 4.604989604989605, "grad_norm": 0.9139947295188904, "learning_rate": 4.746129458429371e-05, "loss": 0.0674, "num_input_tokens_seen": 1690160, "step": 8860 }, { "epoch": 4.607588357588358, "grad_norm": 1.3066202402114868, "learning_rate": 4.745631354778749e-05, "loss": 0.311, "num_input_tokens_seen": 1691120, "step": 8865 }, { "epoch": 4.61018711018711, "grad_norm": 0.3695688247680664, "learning_rate": 4.745132789148244e-05, "loss": 0.0572, "num_input_tokens_seen": 1692016, "step": 8870 }, { "epoch": 4.612785862785863, "grad_norm": 0.9717716574668884, "learning_rate": 4.7446337616404234e-05, "loss": 0.2714, "num_input_tokens_seen": 1693040, "step": 8875 }, { "epoch": 4.615384615384615, "grad_norm": 1.1866720914840698, "learning_rate": 4.744134272357948e-05, "loss": 0.2099, "num_input_tokens_seen": 1693936, "step": 8880 }, { "epoch": 4.617983367983368, "grad_norm": 0.4004279673099518, "learning_rate": 4.743634321403575e-05, "loss": 0.092, "num_input_tokens_seen": 1694928, "step": 8885 }, { "epoch": 4.620582120582121, "grad_norm": 1.0200234651565552, "learning_rate": 4.743133908880156e-05, "loss": 0.2541, "num_input_tokens_seen": 1695888, "step": 8890 }, { "epoch": 4.623180873180873, "grad_norm": 0.747031033039093, "learning_rate": 4.742633034890639e-05, "loss": 0.1414, "num_input_tokens_seen": 1696848, "step": 8895 }, { "epoch": 4.625779625779626, "grad_norm": 0.6748170852661133, "learning_rate": 4.742131699538065e-05, "loss": 0.1514, "num_input_tokens_seen": 1697712, "step": 8900 }, { "epoch": 4.628378378378378, "grad_norm": 0.7510544061660767, "learning_rate": 4.74162990292557e-05, "loss": 0.1457, "num_input_tokens_seen": 1698704, "step": 8905 }, { "epoch": 4.630977130977131, "grad_norm": 0.19457632303237915, "learning_rate": 4.7411276451563854e-05, "loss": 0.1217, "num_input_tokens_seen": 1699664, "step": 8910 }, { "epoch": 4.633575883575883, "grad_norm": 1.0318939685821533, "learning_rate": 4.740624926333839e-05, "loss": 0.1503, "num_input_tokens_seen": 1700624, "step": 8915 }, { "epoch": 4.636174636174636, "grad_norm": 0.7010712027549744, "learning_rate": 4.7401217465613506e-05, "loss": 0.1988, "num_input_tokens_seen": 1701520, "step": 8920 }, { "epoch": 4.638773388773389, "grad_norm": 0.16164380311965942, "learning_rate": 4.739618105942437e-05, "loss": 0.0789, "num_input_tokens_seen": 1702480, "step": 8925 }, { "epoch": 4.641372141372141, "grad_norm": 0.5214368104934692, "learning_rate": 4.739114004580709e-05, "loss": 0.1328, "num_input_tokens_seen": 1703472, "step": 8930 }, { "epoch": 4.643970893970894, "grad_norm": 0.46385395526885986, "learning_rate": 4.738609442579873e-05, "loss": 0.1746, "num_input_tokens_seen": 1704432, "step": 8935 }, { "epoch": 4.646569646569646, "grad_norm": 0.32250165939331055, "learning_rate": 4.738104420043728e-05, "loss": 0.1419, "num_input_tokens_seen": 1705360, "step": 8940 }, { "epoch": 4.649168399168399, "grad_norm": 0.08903876692056656, "learning_rate": 4.7375989370761695e-05, "loss": 0.0623, "num_input_tokens_seen": 1706320, "step": 8945 }, { "epoch": 4.651767151767151, "grad_norm": 1.5396459102630615, "learning_rate": 4.7370929937811884e-05, "loss": 0.1369, "num_input_tokens_seen": 1707248, "step": 8950 }, { "epoch": 4.654365904365904, "grad_norm": 0.8380807638168335, "learning_rate": 4.7365865902628684e-05, "loss": 0.1702, "num_input_tokens_seen": 1708176, "step": 8955 }, { "epoch": 4.656964656964657, "grad_norm": 1.3143610954284668, "learning_rate": 4.736079726625389e-05, "loss": 0.1745, "num_input_tokens_seen": 1709072, "step": 8960 }, { "epoch": 4.659563409563409, "grad_norm": 0.6162500977516174, "learning_rate": 4.7355724029730243e-05, "loss": 0.2269, "num_input_tokens_seen": 1709968, "step": 8965 }, { "epoch": 4.662162162162162, "grad_norm": 2.570538282394409, "learning_rate": 4.735064619410142e-05, "loss": 0.2147, "num_input_tokens_seen": 1710896, "step": 8970 }, { "epoch": 4.664760914760915, "grad_norm": 0.7011119723320007, "learning_rate": 4.734556376041207e-05, "loss": 0.1476, "num_input_tokens_seen": 1711856, "step": 8975 }, { "epoch": 4.667359667359667, "grad_norm": 0.5282407999038696, "learning_rate": 4.734047672970776e-05, "loss": 0.1436, "num_input_tokens_seen": 1712752, "step": 8980 }, { "epoch": 4.66995841995842, "grad_norm": 0.18952976167201996, "learning_rate": 4.7335385103035e-05, "loss": 0.1048, "num_input_tokens_seen": 1713712, "step": 8985 }, { "epoch": 4.672557172557172, "grad_norm": 0.7816908955574036, "learning_rate": 4.7330288881441286e-05, "loss": 0.1629, "num_input_tokens_seen": 1714704, "step": 8990 }, { "epoch": 4.675155925155925, "grad_norm": 0.5610539317131042, "learning_rate": 4.732518806597501e-05, "loss": 0.2061, "num_input_tokens_seen": 1715632, "step": 8995 }, { "epoch": 4.6777546777546775, "grad_norm": 0.4232260286808014, "learning_rate": 4.732008265768555e-05, "loss": 0.1317, "num_input_tokens_seen": 1716560, "step": 9000 }, { "epoch": 4.68035343035343, "grad_norm": 0.7930416464805603, "learning_rate": 4.7314972657623194e-05, "loss": 0.1736, "num_input_tokens_seen": 1717552, "step": 9005 }, { "epoch": 4.682952182952183, "grad_norm": 0.2838062047958374, "learning_rate": 4.73098580668392e-05, "loss": 0.0976, "num_input_tokens_seen": 1718512, "step": 9010 }, { "epoch": 4.6855509355509355, "grad_norm": 0.8826428055763245, "learning_rate": 4.730473888638576e-05, "loss": 0.153, "num_input_tokens_seen": 1719408, "step": 9015 }, { "epoch": 4.6881496881496885, "grad_norm": 0.7016394734382629, "learning_rate": 4.7299615117316e-05, "loss": 0.2162, "num_input_tokens_seen": 1720336, "step": 9020 }, { "epoch": 4.6907484407484406, "grad_norm": 0.5031372308731079, "learning_rate": 4.729448676068402e-05, "loss": 0.2067, "num_input_tokens_seen": 1721296, "step": 9025 }, { "epoch": 4.6933471933471935, "grad_norm": 0.22194689512252808, "learning_rate": 4.7289353817544837e-05, "loss": 0.0675, "num_input_tokens_seen": 1722224, "step": 9030 }, { "epoch": 4.695945945945946, "grad_norm": 1.0169156789779663, "learning_rate": 4.728421628895441e-05, "loss": 0.1763, "num_input_tokens_seen": 1723152, "step": 9035 }, { "epoch": 4.698544698544699, "grad_norm": 0.5238800644874573, "learning_rate": 4.7279074175969667e-05, "loss": 0.099, "num_input_tokens_seen": 1724016, "step": 9040 }, { "epoch": 4.701143451143452, "grad_norm": 0.7936383485794067, "learning_rate": 4.727392747964846e-05, "loss": 0.175, "num_input_tokens_seen": 1725008, "step": 9045 }, { "epoch": 4.703742203742204, "grad_norm": 1.0842647552490234, "learning_rate": 4.726877620104958e-05, "loss": 0.1256, "num_input_tokens_seen": 1725936, "step": 9050 }, { "epoch": 4.706340956340957, "grad_norm": 1.1579145193099976, "learning_rate": 4.726362034123276e-05, "loss": 0.1656, "num_input_tokens_seen": 1726864, "step": 9055 }, { "epoch": 4.708939708939709, "grad_norm": 1.3188310861587524, "learning_rate": 4.725845990125871e-05, "loss": 0.2439, "num_input_tokens_seen": 1727824, "step": 9060 }, { "epoch": 4.711538461538462, "grad_norm": 0.43847358226776123, "learning_rate": 4.725329488218904e-05, "loss": 0.0746, "num_input_tokens_seen": 1728816, "step": 9065 }, { "epoch": 4.714137214137214, "grad_norm": 1.1567747592926025, "learning_rate": 4.724812528508631e-05, "loss": 0.2586, "num_input_tokens_seen": 1729712, "step": 9070 }, { "epoch": 4.716735966735967, "grad_norm": 1.2378814220428467, "learning_rate": 4.7242951111014036e-05, "loss": 0.2106, "num_input_tokens_seen": 1730704, "step": 9075 }, { "epoch": 4.71933471933472, "grad_norm": 0.4755956530570984, "learning_rate": 4.723777236103667e-05, "loss": 0.1428, "num_input_tokens_seen": 1731664, "step": 9080 }, { "epoch": 4.721933471933472, "grad_norm": 0.5534799695014954, "learning_rate": 4.723258903621961e-05, "loss": 0.3269, "num_input_tokens_seen": 1732624, "step": 9085 }, { "epoch": 4.724532224532225, "grad_norm": 0.5792751908302307, "learning_rate": 4.722740113762918e-05, "loss": 0.1185, "num_input_tokens_seen": 1733584, "step": 9090 }, { "epoch": 4.727130977130977, "grad_norm": 0.7013493180274963, "learning_rate": 4.7222208666332656e-05, "loss": 0.1026, "num_input_tokens_seen": 1734512, "step": 9095 }, { "epoch": 4.72972972972973, "grad_norm": 0.6409620046615601, "learning_rate": 4.7217011623398254e-05, "loss": 0.1358, "num_input_tokens_seen": 1735440, "step": 9100 }, { "epoch": 4.732328482328482, "grad_norm": 0.3705820143222809, "learning_rate": 4.721181000989513e-05, "loss": 0.1001, "num_input_tokens_seen": 1736496, "step": 9105 }, { "epoch": 4.734927234927235, "grad_norm": 0.5206118226051331, "learning_rate": 4.720660382689339e-05, "loss": 0.1199, "num_input_tokens_seen": 1737552, "step": 9110 }, { "epoch": 4.737525987525988, "grad_norm": 0.3887065351009369, "learning_rate": 4.720139307546405e-05, "loss": 0.2289, "num_input_tokens_seen": 1738480, "step": 9115 }, { "epoch": 4.74012474012474, "grad_norm": 0.31175661087036133, "learning_rate": 4.719617775667909e-05, "loss": 0.1909, "num_input_tokens_seen": 1739504, "step": 9120 }, { "epoch": 4.742723492723493, "grad_norm": 0.7034933567047119, "learning_rate": 4.719095787161145e-05, "loss": 0.1471, "num_input_tokens_seen": 1740432, "step": 9125 }, { "epoch": 4.745322245322245, "grad_norm": 0.4432564079761505, "learning_rate": 4.718573342133495e-05, "loss": 0.1488, "num_input_tokens_seen": 1741328, "step": 9130 }, { "epoch": 4.747920997920998, "grad_norm": 0.23324789106845856, "learning_rate": 4.718050440692441e-05, "loss": 0.1521, "num_input_tokens_seen": 1742320, "step": 9135 }, { "epoch": 4.75051975051975, "grad_norm": 1.410588264465332, "learning_rate": 4.717527082945554e-05, "loss": 0.237, "num_input_tokens_seen": 1743248, "step": 9140 }, { "epoch": 4.753118503118503, "grad_norm": 0.6384000778198242, "learning_rate": 4.717003269000504e-05, "loss": 0.0535, "num_input_tokens_seen": 1744240, "step": 9145 }, { "epoch": 4.755717255717256, "grad_norm": 0.8386662602424622, "learning_rate": 4.7164789989650496e-05, "loss": 0.2094, "num_input_tokens_seen": 1745200, "step": 9150 }, { "epoch": 4.758316008316008, "grad_norm": 1.0153921842575073, "learning_rate": 4.715954272947046e-05, "loss": 0.0765, "num_input_tokens_seen": 1746096, "step": 9155 }, { "epoch": 4.760914760914761, "grad_norm": 0.4672171473503113, "learning_rate": 4.715429091054443e-05, "loss": 0.1124, "num_input_tokens_seen": 1747056, "step": 9160 }, { "epoch": 4.763513513513513, "grad_norm": 0.784618079662323, "learning_rate": 4.714903453395283e-05, "loss": 0.1245, "num_input_tokens_seen": 1747984, "step": 9165 }, { "epoch": 4.766112266112266, "grad_norm": 1.038323163986206, "learning_rate": 4.714377360077701e-05, "loss": 0.1735, "num_input_tokens_seen": 1748976, "step": 9170 }, { "epoch": 4.768711018711018, "grad_norm": 0.49282166361808777, "learning_rate": 4.7138508112099265e-05, "loss": 0.1193, "num_input_tokens_seen": 1749872, "step": 9175 }, { "epoch": 4.771309771309771, "grad_norm": 1.0870449542999268, "learning_rate": 4.713323806900285e-05, "loss": 0.1243, "num_input_tokens_seen": 1750864, "step": 9180 }, { "epoch": 4.773908523908524, "grad_norm": 0.19393238425254822, "learning_rate": 4.712796347257193e-05, "loss": 0.0879, "num_input_tokens_seen": 1751792, "step": 9185 }, { "epoch": 4.776507276507276, "grad_norm": 0.57313472032547, "learning_rate": 4.712268432389162e-05, "loss": 0.0896, "num_input_tokens_seen": 1752880, "step": 9190 }, { "epoch": 4.779106029106029, "grad_norm": 0.2512226998806, "learning_rate": 4.711740062404795e-05, "loss": 0.0998, "num_input_tokens_seen": 1753840, "step": 9195 }, { "epoch": 4.781704781704782, "grad_norm": 0.8605040907859802, "learning_rate": 4.7112112374127926e-05, "loss": 0.1524, "num_input_tokens_seen": 1754768, "step": 9200 }, { "epoch": 4.784303534303534, "grad_norm": 0.36951810121536255, "learning_rate": 4.7106819575219444e-05, "loss": 0.2147, "num_input_tokens_seen": 1755728, "step": 9205 }, { "epoch": 4.786902286902287, "grad_norm": 0.21382583677768707, "learning_rate": 4.7101522228411375e-05, "loss": 0.2311, "num_input_tokens_seen": 1756688, "step": 9210 }, { "epoch": 4.789501039501039, "grad_norm": 1.2781835794448853, "learning_rate": 4.709622033479351e-05, "loss": 0.1923, "num_input_tokens_seen": 1757552, "step": 9215 }, { "epoch": 4.792099792099792, "grad_norm": 0.10959859192371368, "learning_rate": 4.709091389545657e-05, "loss": 0.2126, "num_input_tokens_seen": 1758544, "step": 9220 }, { "epoch": 4.794698544698544, "grad_norm": 1.2008075714111328, "learning_rate": 4.708560291149221e-05, "loss": 0.1773, "num_input_tokens_seen": 1759504, "step": 9225 }, { "epoch": 4.797297297297297, "grad_norm": 0.7974687218666077, "learning_rate": 4.708028738399304e-05, "loss": 0.1337, "num_input_tokens_seen": 1760496, "step": 9230 }, { "epoch": 4.79989604989605, "grad_norm": 0.18073561787605286, "learning_rate": 4.707496731405257e-05, "loss": 0.1454, "num_input_tokens_seen": 1761488, "step": 9235 }, { "epoch": 4.802494802494802, "grad_norm": 0.40279150009155273, "learning_rate": 4.706964270276528e-05, "loss": 0.0964, "num_input_tokens_seen": 1762480, "step": 9240 }, { "epoch": 4.805093555093555, "grad_norm": 0.18458321690559387, "learning_rate": 4.706431355122658e-05, "loss": 0.1549, "num_input_tokens_seen": 1763440, "step": 9245 }, { "epoch": 4.8076923076923075, "grad_norm": 0.05545327067375183, "learning_rate": 4.705897986053278e-05, "loss": 0.0814, "num_input_tokens_seen": 1764432, "step": 9250 }, { "epoch": 4.8102910602910605, "grad_norm": 0.5542370676994324, "learning_rate": 4.705364163178116e-05, "loss": 0.1459, "num_input_tokens_seen": 1765424, "step": 9255 }, { "epoch": 4.8128898128898125, "grad_norm": 0.5043593645095825, "learning_rate": 4.704829886606992e-05, "loss": 0.1729, "num_input_tokens_seen": 1766288, "step": 9260 }, { "epoch": 4.8154885654885655, "grad_norm": 0.3399135172367096, "learning_rate": 4.704295156449819e-05, "loss": 0.1829, "num_input_tokens_seen": 1767280, "step": 9265 }, { "epoch": 4.8180873180873185, "grad_norm": 1.097612977027893, "learning_rate": 4.7037599728166045e-05, "loss": 0.2701, "num_input_tokens_seen": 1768240, "step": 9270 }, { "epoch": 4.820686070686071, "grad_norm": 0.7488914728164673, "learning_rate": 4.703224335817448e-05, "loss": 0.1313, "num_input_tokens_seen": 1769136, "step": 9275 }, { "epoch": 4.8232848232848236, "grad_norm": 0.22164598107337952, "learning_rate": 4.702688245562542e-05, "loss": 0.1643, "num_input_tokens_seen": 1770064, "step": 9280 }, { "epoch": 4.825883575883576, "grad_norm": 0.3412473797798157, "learning_rate": 4.7021517021621746e-05, "loss": 0.1101, "num_input_tokens_seen": 1770992, "step": 9285 }, { "epoch": 4.828482328482329, "grad_norm": 0.3476795554161072, "learning_rate": 4.701614705726725e-05, "loss": 0.2305, "num_input_tokens_seen": 1771920, "step": 9290 }, { "epoch": 4.831081081081081, "grad_norm": 0.7074377536773682, "learning_rate": 4.701077256366666e-05, "loss": 0.2014, "num_input_tokens_seen": 1772784, "step": 9295 }, { "epoch": 4.833679833679834, "grad_norm": 0.32871776819229126, "learning_rate": 4.700539354192564e-05, "loss": 0.1703, "num_input_tokens_seen": 1773712, "step": 9300 }, { "epoch": 4.836278586278587, "grad_norm": 0.4341549277305603, "learning_rate": 4.700000999315077e-05, "loss": 0.0985, "num_input_tokens_seen": 1774640, "step": 9305 }, { "epoch": 4.838877338877339, "grad_norm": 0.7049396634101868, "learning_rate": 4.699462191844959e-05, "loss": 0.2218, "num_input_tokens_seen": 1775600, "step": 9310 }, { "epoch": 4.841476091476092, "grad_norm": 0.4648714065551758, "learning_rate": 4.698922931893055e-05, "loss": 0.1045, "num_input_tokens_seen": 1776592, "step": 9315 }, { "epoch": 4.844074844074844, "grad_norm": 0.5310875177383423, "learning_rate": 4.6983832195703035e-05, "loss": 0.1076, "num_input_tokens_seen": 1777552, "step": 9320 }, { "epoch": 4.846673596673597, "grad_norm": 0.22814297676086426, "learning_rate": 4.697843054987737e-05, "loss": 0.0446, "num_input_tokens_seen": 1778448, "step": 9325 }, { "epoch": 4.849272349272349, "grad_norm": 0.8136988878250122, "learning_rate": 4.697302438256479e-05, "loss": 0.217, "num_input_tokens_seen": 1779344, "step": 9330 }, { "epoch": 4.851871101871102, "grad_norm": 0.648968517780304, "learning_rate": 4.6967613694877474e-05, "loss": 0.1909, "num_input_tokens_seen": 1780336, "step": 9335 }, { "epoch": 4.854469854469855, "grad_norm": 0.07386104017496109, "learning_rate": 4.696219848792854e-05, "loss": 0.1411, "num_input_tokens_seen": 1781296, "step": 9340 }, { "epoch": 4.857068607068607, "grad_norm": 0.453887939453125, "learning_rate": 4.695677876283202e-05, "loss": 0.1954, "num_input_tokens_seen": 1782256, "step": 9345 }, { "epoch": 4.85966735966736, "grad_norm": 0.5115061402320862, "learning_rate": 4.695135452070288e-05, "loss": 0.0722, "num_input_tokens_seen": 1783216, "step": 9350 }, { "epoch": 4.862266112266112, "grad_norm": 1.5819424390792847, "learning_rate": 4.6945925762657e-05, "loss": 0.1584, "num_input_tokens_seen": 1784176, "step": 9355 }, { "epoch": 4.864864864864865, "grad_norm": 1.032603144645691, "learning_rate": 4.6940492489811236e-05, "loss": 0.1898, "num_input_tokens_seen": 1785104, "step": 9360 }, { "epoch": 4.867463617463617, "grad_norm": 0.13723640143871307, "learning_rate": 4.693505470328332e-05, "loss": 0.1702, "num_input_tokens_seen": 1786032, "step": 9365 }, { "epoch": 4.87006237006237, "grad_norm": 0.7049927115440369, "learning_rate": 4.692961240419195e-05, "loss": 0.1477, "num_input_tokens_seen": 1786992, "step": 9370 }, { "epoch": 4.872661122661123, "grad_norm": 0.11893205344676971, "learning_rate": 4.6924165593656714e-05, "loss": 0.1179, "num_input_tokens_seen": 1787952, "step": 9375 }, { "epoch": 4.875259875259875, "grad_norm": 0.9602859020233154, "learning_rate": 4.691871427279816e-05, "loss": 0.1517, "num_input_tokens_seen": 1788912, "step": 9380 }, { "epoch": 4.877858627858628, "grad_norm": 0.7575761675834656, "learning_rate": 4.691325844273777e-05, "loss": 0.1815, "num_input_tokens_seen": 1789808, "step": 9385 }, { "epoch": 4.88045738045738, "grad_norm": 0.34106341004371643, "learning_rate": 4.690779810459792e-05, "loss": 0.1369, "num_input_tokens_seen": 1790864, "step": 9390 }, { "epoch": 4.883056133056133, "grad_norm": 0.2212759554386139, "learning_rate": 4.690233325950194e-05, "loss": 0.2354, "num_input_tokens_seen": 1791824, "step": 9395 }, { "epoch": 4.885654885654886, "grad_norm": 1.0439410209655762, "learning_rate": 4.689686390857407e-05, "loss": 0.2171, "num_input_tokens_seen": 1792816, "step": 9400 }, { "epoch": 4.888253638253638, "grad_norm": 0.656947910785675, "learning_rate": 4.68913900529395e-05, "loss": 0.1402, "num_input_tokens_seen": 1793776, "step": 9405 }, { "epoch": 4.890852390852391, "grad_norm": 0.3290175497531891, "learning_rate": 4.6885911693724315e-05, "loss": 0.1568, "num_input_tokens_seen": 1794768, "step": 9410 }, { "epoch": 4.893451143451143, "grad_norm": 1.3393480777740479, "learning_rate": 4.6880428832055564e-05, "loss": 0.214, "num_input_tokens_seen": 1795824, "step": 9415 }, { "epoch": 4.896049896049896, "grad_norm": 1.143235206604004, "learning_rate": 4.687494146906119e-05, "loss": 0.2181, "num_input_tokens_seen": 1796720, "step": 9420 }, { "epoch": 4.898648648648649, "grad_norm": 0.8250828981399536, "learning_rate": 4.686944960587007e-05, "loss": 0.1246, "num_input_tokens_seen": 1797712, "step": 9425 }, { "epoch": 4.901247401247401, "grad_norm": 0.8603610992431641, "learning_rate": 4.6863953243612024e-05, "loss": 0.152, "num_input_tokens_seen": 1798704, "step": 9430 }, { "epoch": 4.903846153846154, "grad_norm": 0.20248110592365265, "learning_rate": 4.6858452383417786e-05, "loss": 0.0906, "num_input_tokens_seen": 1799664, "step": 9435 }, { "epoch": 4.906444906444906, "grad_norm": 0.6059004664421082, "learning_rate": 4.6852947026418993e-05, "loss": 0.1794, "num_input_tokens_seen": 1800592, "step": 9440 }, { "epoch": 4.909043659043659, "grad_norm": 0.5111360549926758, "learning_rate": 4.6847437173748246e-05, "loss": 0.2, "num_input_tokens_seen": 1801488, "step": 9445 }, { "epoch": 4.911642411642411, "grad_norm": 0.93937087059021, "learning_rate": 4.6841922826539056e-05, "loss": 0.1219, "num_input_tokens_seen": 1802416, "step": 9450 }, { "epoch": 4.914241164241164, "grad_norm": 1.227138876914978, "learning_rate": 4.683640398592584e-05, "loss": 0.3355, "num_input_tokens_seen": 1803280, "step": 9455 }, { "epoch": 4.916839916839917, "grad_norm": 1.0293231010437012, "learning_rate": 4.683088065304396e-05, "loss": 0.0855, "num_input_tokens_seen": 1804272, "step": 9460 }, { "epoch": 4.919438669438669, "grad_norm": 0.17705510556697845, "learning_rate": 4.6825352829029705e-05, "loss": 0.1233, "num_input_tokens_seen": 1805200, "step": 9465 }, { "epoch": 4.922037422037422, "grad_norm": 0.6934592723846436, "learning_rate": 4.6819820515020274e-05, "loss": 0.2062, "num_input_tokens_seen": 1806224, "step": 9470 }, { "epoch": 4.924636174636174, "grad_norm": 0.37744438648223877, "learning_rate": 4.68142837121538e-05, "loss": 0.1402, "num_input_tokens_seen": 1807152, "step": 9475 }, { "epoch": 4.927234927234927, "grad_norm": 0.8168502449989319, "learning_rate": 4.6808742421569334e-05, "loss": 0.1304, "num_input_tokens_seen": 1808112, "step": 9480 }, { "epoch": 4.9298336798336795, "grad_norm": 0.22696426510810852, "learning_rate": 4.680319664440685e-05, "loss": 0.1703, "num_input_tokens_seen": 1809072, "step": 9485 }, { "epoch": 4.9324324324324325, "grad_norm": 0.8793490529060364, "learning_rate": 4.679764638180724e-05, "loss": 0.1958, "num_input_tokens_seen": 1810032, "step": 9490 }, { "epoch": 4.935031185031185, "grad_norm": 0.5138303637504578, "learning_rate": 4.679209163491234e-05, "loss": 0.0901, "num_input_tokens_seen": 1810992, "step": 9495 }, { "epoch": 4.9376299376299375, "grad_norm": 0.39763012528419495, "learning_rate": 4.678653240486488e-05, "loss": 0.0611, "num_input_tokens_seen": 1811920, "step": 9500 }, { "epoch": 4.9402286902286905, "grad_norm": 0.10234834998846054, "learning_rate": 4.678096869280853e-05, "loss": 0.1351, "num_input_tokens_seen": 1812784, "step": 9505 }, { "epoch": 4.942827442827443, "grad_norm": 0.3795762062072754, "learning_rate": 4.677540049988789e-05, "loss": 0.1343, "num_input_tokens_seen": 1813712, "step": 9510 }, { "epoch": 4.9454261954261955, "grad_norm": 0.3767922520637512, "learning_rate": 4.676982782724846e-05, "loss": 0.1612, "num_input_tokens_seen": 1814608, "step": 9515 }, { "epoch": 4.948024948024948, "grad_norm": 0.9276726245880127, "learning_rate": 4.6764250676036665e-05, "loss": 0.2067, "num_input_tokens_seen": 1815536, "step": 9520 }, { "epoch": 4.950623700623701, "grad_norm": 0.5412101745605469, "learning_rate": 4.675866904739986e-05, "loss": 0.1308, "num_input_tokens_seen": 1816432, "step": 9525 }, { "epoch": 4.953222453222454, "grad_norm": 0.8623619079589844, "learning_rate": 4.675308294248633e-05, "loss": 0.1914, "num_input_tokens_seen": 1817392, "step": 9530 }, { "epoch": 4.955821205821206, "grad_norm": 0.5886671543121338, "learning_rate": 4.674749236244527e-05, "loss": 0.0866, "num_input_tokens_seen": 1818288, "step": 9535 }, { "epoch": 4.958419958419959, "grad_norm": 0.17492376267910004, "learning_rate": 4.674189730842677e-05, "loss": 0.1214, "num_input_tokens_seen": 1819344, "step": 9540 }, { "epoch": 4.961018711018711, "grad_norm": 0.8948670625686646, "learning_rate": 4.6736297781581905e-05, "loss": 0.1384, "num_input_tokens_seen": 1820272, "step": 9545 }, { "epoch": 4.963617463617464, "grad_norm": 0.4888293743133545, "learning_rate": 4.6730693783062604e-05, "loss": 0.1242, "num_input_tokens_seen": 1821232, "step": 9550 }, { "epoch": 4.966216216216216, "grad_norm": 0.5460741519927979, "learning_rate": 4.672508531402174e-05, "loss": 0.205, "num_input_tokens_seen": 1822192, "step": 9555 }, { "epoch": 4.968814968814969, "grad_norm": 0.08847492933273315, "learning_rate": 4.671947237561313e-05, "loss": 0.1049, "num_input_tokens_seen": 1823184, "step": 9560 }, { "epoch": 4.971413721413722, "grad_norm": 0.7439528703689575, "learning_rate": 4.6713854968991476e-05, "loss": 0.0823, "num_input_tokens_seen": 1824144, "step": 9565 }, { "epoch": 4.974012474012474, "grad_norm": 0.36928805708885193, "learning_rate": 4.6708233095312414e-05, "loss": 0.0877, "num_input_tokens_seen": 1825104, "step": 9570 }, { "epoch": 4.976611226611227, "grad_norm": 0.6239433884620667, "learning_rate": 4.67026067557325e-05, "loss": 0.1604, "num_input_tokens_seen": 1826000, "step": 9575 }, { "epoch": 4.979209979209979, "grad_norm": 0.36562690138816833, "learning_rate": 4.66969759514092e-05, "loss": 0.2664, "num_input_tokens_seen": 1826896, "step": 9580 }, { "epoch": 4.981808731808732, "grad_norm": 0.7050250172615051, "learning_rate": 4.669134068350091e-05, "loss": 0.1117, "num_input_tokens_seen": 1827760, "step": 9585 }, { "epoch": 4.984407484407484, "grad_norm": 0.9084662199020386, "learning_rate": 4.6685700953166934e-05, "loss": 0.1732, "num_input_tokens_seen": 1828848, "step": 9590 }, { "epoch": 4.987006237006237, "grad_norm": 0.8679859638214111, "learning_rate": 4.6680056761567506e-05, "loss": 0.1097, "num_input_tokens_seen": 1829840, "step": 9595 }, { "epoch": 4.98960498960499, "grad_norm": 0.5210300087928772, "learning_rate": 4.667440810986377e-05, "loss": 0.1526, "num_input_tokens_seen": 1830768, "step": 9600 }, { "epoch": 4.992203742203742, "grad_norm": 0.42394694685935974, "learning_rate": 4.666875499921778e-05, "loss": 0.0974, "num_input_tokens_seen": 1831696, "step": 9605 }, { "epoch": 4.994802494802495, "grad_norm": 0.431241899728775, "learning_rate": 4.666309743079252e-05, "loss": 0.1319, "num_input_tokens_seen": 1832688, "step": 9610 }, { "epoch": 4.997401247401247, "grad_norm": 0.4092947244644165, "learning_rate": 4.665743540575189e-05, "loss": 0.1437, "num_input_tokens_seen": 1833648, "step": 9615 }, { "epoch": 5.0, "grad_norm": 0.5537122488021851, "learning_rate": 4.665176892526071e-05, "loss": 0.1421, "num_input_tokens_seen": 1834568, "step": 9620 }, { "epoch": 5.0, "eval_loss": 0.15400303900241852, "eval_runtime": 8.017, "eval_samples_per_second": 106.774, "eval_steps_per_second": 26.693, "num_input_tokens_seen": 1834568, "step": 9620 }, { "epoch": 5.002598752598753, "grad_norm": 0.15881864726543427, "learning_rate": 4.664609799048469e-05, "loss": 0.1442, "num_input_tokens_seen": 1835496, "step": 9625 }, { "epoch": 5.005197505197505, "grad_norm": 0.3615124821662903, "learning_rate": 4.664042260259048e-05, "loss": 0.1886, "num_input_tokens_seen": 1836424, "step": 9630 }, { "epoch": 5.007796257796258, "grad_norm": 0.3831976652145386, "learning_rate": 4.6634742762745665e-05, "loss": 0.1075, "num_input_tokens_seen": 1837480, "step": 9635 }, { "epoch": 5.01039501039501, "grad_norm": 0.11269360035657883, "learning_rate": 4.6629058472118705e-05, "loss": 0.2434, "num_input_tokens_seen": 1838504, "step": 9640 }, { "epoch": 5.012993762993763, "grad_norm": 0.7546210289001465, "learning_rate": 4.6623369731878994e-05, "loss": 0.1642, "num_input_tokens_seen": 1839400, "step": 9645 }, { "epoch": 5.015592515592515, "grad_norm": 0.4982606768608093, "learning_rate": 4.6617676543196856e-05, "loss": 0.0867, "num_input_tokens_seen": 1840264, "step": 9650 }, { "epoch": 5.018191268191268, "grad_norm": 0.6036336421966553, "learning_rate": 4.661197890724349e-05, "loss": 0.1906, "num_input_tokens_seen": 1841160, "step": 9655 }, { "epoch": 5.020790020790021, "grad_norm": 0.23739174008369446, "learning_rate": 4.660627682519105e-05, "loss": 0.152, "num_input_tokens_seen": 1842152, "step": 9660 }, { "epoch": 5.023388773388773, "grad_norm": 0.9274232387542725, "learning_rate": 4.66005702982126e-05, "loss": 0.1224, "num_input_tokens_seen": 1843112, "step": 9665 }, { "epoch": 5.025987525987526, "grad_norm": 0.14050570130348206, "learning_rate": 4.659485932748209e-05, "loss": 0.1203, "num_input_tokens_seen": 1844072, "step": 9670 }, { "epoch": 5.028586278586278, "grad_norm": 0.408308207988739, "learning_rate": 4.658914391417442e-05, "loss": 0.129, "num_input_tokens_seen": 1845032, "step": 9675 }, { "epoch": 5.031185031185031, "grad_norm": 0.8804634809494019, "learning_rate": 4.6583424059465376e-05, "loss": 0.1797, "num_input_tokens_seen": 1845992, "step": 9680 }, { "epoch": 5.033783783783784, "grad_norm": 1.358305811882019, "learning_rate": 4.657769976453167e-05, "loss": 0.1599, "num_input_tokens_seen": 1846952, "step": 9685 }, { "epoch": 5.036382536382536, "grad_norm": 0.27119579911231995, "learning_rate": 4.657197103055092e-05, "loss": 0.137, "num_input_tokens_seen": 1847912, "step": 9690 }, { "epoch": 5.038981288981289, "grad_norm": 0.23163284361362457, "learning_rate": 4.656623785870167e-05, "loss": 0.1726, "num_input_tokens_seen": 1848936, "step": 9695 }, { "epoch": 5.041580041580041, "grad_norm": 0.11253142356872559, "learning_rate": 4.656050025016338e-05, "loss": 0.193, "num_input_tokens_seen": 1849832, "step": 9700 }, { "epoch": 5.044178794178794, "grad_norm": 0.6612008213996887, "learning_rate": 4.655475820611639e-05, "loss": 0.0551, "num_input_tokens_seen": 1850824, "step": 9705 }, { "epoch": 5.046777546777546, "grad_norm": 0.46220481395721436, "learning_rate": 4.6549011727742e-05, "loss": 0.1872, "num_input_tokens_seen": 1851816, "step": 9710 }, { "epoch": 5.049376299376299, "grad_norm": 1.2953776121139526, "learning_rate": 4.654326081622238e-05, "loss": 0.1892, "num_input_tokens_seen": 1852776, "step": 9715 }, { "epoch": 5.051975051975052, "grad_norm": 0.6233323812484741, "learning_rate": 4.653750547274063e-05, "loss": 0.1221, "num_input_tokens_seen": 1853672, "step": 9720 }, { "epoch": 5.0545738045738045, "grad_norm": 0.9191915988922119, "learning_rate": 4.653174569848077e-05, "loss": 0.3097, "num_input_tokens_seen": 1854632, "step": 9725 }, { "epoch": 5.057172557172557, "grad_norm": 0.35358157753944397, "learning_rate": 4.6525981494627725e-05, "loss": 0.0775, "num_input_tokens_seen": 1855592, "step": 9730 }, { "epoch": 5.0597713097713095, "grad_norm": 0.36466163396835327, "learning_rate": 4.6520212862367315e-05, "loss": 0.0542, "num_input_tokens_seen": 1856456, "step": 9735 }, { "epoch": 5.0623700623700625, "grad_norm": 0.18147526681423187, "learning_rate": 4.65144398028863e-05, "loss": 0.1609, "num_input_tokens_seen": 1857352, "step": 9740 }, { "epoch": 5.064968814968815, "grad_norm": 0.6800925135612488, "learning_rate": 4.650866231737232e-05, "loss": 0.1382, "num_input_tokens_seen": 1858312, "step": 9745 }, { "epoch": 5.0675675675675675, "grad_norm": 0.6526462435722351, "learning_rate": 4.650288040701396e-05, "loss": 0.1, "num_input_tokens_seen": 1859176, "step": 9750 }, { "epoch": 5.0701663201663205, "grad_norm": 0.4983646869659424, "learning_rate": 4.649709407300068e-05, "loss": 0.1615, "num_input_tokens_seen": 1860200, "step": 9755 }, { "epoch": 5.072765072765073, "grad_norm": 0.4335705041885376, "learning_rate": 4.649130331652287e-05, "loss": 0.0868, "num_input_tokens_seen": 1861096, "step": 9760 }, { "epoch": 5.075363825363826, "grad_norm": 0.7119868993759155, "learning_rate": 4.6485508138771836e-05, "loss": 0.1858, "num_input_tokens_seen": 1861992, "step": 9765 }, { "epoch": 5.077962577962578, "grad_norm": 0.4630132019519806, "learning_rate": 4.647970854093979e-05, "loss": 0.1955, "num_input_tokens_seen": 1863016, "step": 9770 }, { "epoch": 5.080561330561331, "grad_norm": 0.5923839211463928, "learning_rate": 4.647390452421983e-05, "loss": 0.1546, "num_input_tokens_seen": 1863976, "step": 9775 }, { "epoch": 5.083160083160083, "grad_norm": 0.6825656890869141, "learning_rate": 4.6468096089805976e-05, "loss": 0.1946, "num_input_tokens_seen": 1864904, "step": 9780 }, { "epoch": 5.085758835758836, "grad_norm": 0.22624056041240692, "learning_rate": 4.646228323889318e-05, "loss": 0.1546, "num_input_tokens_seen": 1865896, "step": 9785 }, { "epoch": 5.088357588357589, "grad_norm": 0.8711597919464111, "learning_rate": 4.645646597267728e-05, "loss": 0.0975, "num_input_tokens_seen": 1866856, "step": 9790 }, { "epoch": 5.090956340956341, "grad_norm": 0.7641550302505493, "learning_rate": 4.645064429235502e-05, "loss": 0.1084, "num_input_tokens_seen": 1867752, "step": 9795 }, { "epoch": 5.093555093555094, "grad_norm": 1.0206044912338257, "learning_rate": 4.644481819912405e-05, "loss": 0.1451, "num_input_tokens_seen": 1868712, "step": 9800 }, { "epoch": 5.096153846153846, "grad_norm": 0.3841920793056488, "learning_rate": 4.6438987694182957e-05, "loss": 0.2169, "num_input_tokens_seen": 1869704, "step": 9805 }, { "epoch": 5.098752598752599, "grad_norm": 0.7785800099372864, "learning_rate": 4.64331527787312e-05, "loss": 0.1075, "num_input_tokens_seen": 1870664, "step": 9810 }, { "epoch": 5.101351351351352, "grad_norm": 0.7092668414115906, "learning_rate": 4.6427313453969176e-05, "loss": 0.1861, "num_input_tokens_seen": 1871560, "step": 9815 }, { "epoch": 5.103950103950104, "grad_norm": 0.32947054505348206, "learning_rate": 4.642146972109815e-05, "loss": 0.0852, "num_input_tokens_seen": 1872552, "step": 9820 }, { "epoch": 5.106548856548857, "grad_norm": 0.6605370044708252, "learning_rate": 4.641562158132034e-05, "loss": 0.1718, "num_input_tokens_seen": 1873448, "step": 9825 }, { "epoch": 5.109147609147609, "grad_norm": 0.463339239358902, "learning_rate": 4.640976903583884e-05, "loss": 0.0885, "num_input_tokens_seen": 1874344, "step": 9830 }, { "epoch": 5.111746361746362, "grad_norm": 0.23847681283950806, "learning_rate": 4.6403912085857635e-05, "loss": 0.0957, "num_input_tokens_seen": 1875336, "step": 9835 }, { "epoch": 5.114345114345114, "grad_norm": 0.5323585271835327, "learning_rate": 4.6398050732581675e-05, "loss": 0.1168, "num_input_tokens_seen": 1876232, "step": 9840 }, { "epoch": 5.116943866943867, "grad_norm": 0.5282382965087891, "learning_rate": 4.6392184977216764e-05, "loss": 0.1858, "num_input_tokens_seen": 1877224, "step": 9845 }, { "epoch": 5.11954261954262, "grad_norm": 0.4691373109817505, "learning_rate": 4.638631482096962e-05, "loss": 0.2569, "num_input_tokens_seen": 1878248, "step": 9850 }, { "epoch": 5.122141372141372, "grad_norm": 0.5439823269844055, "learning_rate": 4.638044026504789e-05, "loss": 0.149, "num_input_tokens_seen": 1879208, "step": 9855 }, { "epoch": 5.124740124740125, "grad_norm": 1.6883114576339722, "learning_rate": 4.6374561310660096e-05, "loss": 0.1791, "num_input_tokens_seen": 1880200, "step": 9860 }, { "epoch": 5.127338877338877, "grad_norm": 0.760176420211792, "learning_rate": 4.6368677959015696e-05, "loss": 0.0506, "num_input_tokens_seen": 1881128, "step": 9865 }, { "epoch": 5.12993762993763, "grad_norm": 0.46416544914245605, "learning_rate": 4.636279021132501e-05, "loss": 0.0929, "num_input_tokens_seen": 1882056, "step": 9870 }, { "epoch": 5.132536382536382, "grad_norm": 0.07421161234378815, "learning_rate": 4.6356898068799314e-05, "loss": 0.1197, "num_input_tokens_seen": 1883048, "step": 9875 }, { "epoch": 5.135135135135135, "grad_norm": 0.25913751125335693, "learning_rate": 4.6351001532650746e-05, "loss": 0.2174, "num_input_tokens_seen": 1884008, "step": 9880 }, { "epoch": 5.137733887733888, "grad_norm": 0.4805707633495331, "learning_rate": 4.6345100604092376e-05, "loss": 0.1044, "num_input_tokens_seen": 1884936, "step": 9885 }, { "epoch": 5.14033264033264, "grad_norm": 0.49370670318603516, "learning_rate": 4.6339195284338144e-05, "loss": 0.0618, "num_input_tokens_seen": 1885896, "step": 9890 }, { "epoch": 5.142931392931393, "grad_norm": 0.3896968364715576, "learning_rate": 4.6333285574602945e-05, "loss": 0.1252, "num_input_tokens_seen": 1886856, "step": 9895 }, { "epoch": 5.145530145530145, "grad_norm": 1.124951720237732, "learning_rate": 4.632737147610252e-05, "loss": 0.1636, "num_input_tokens_seen": 1887816, "step": 9900 }, { "epoch": 5.148128898128898, "grad_norm": 0.17548398673534393, "learning_rate": 4.632145299005356e-05, "loss": 0.0741, "num_input_tokens_seen": 1888680, "step": 9905 }, { "epoch": 5.150727650727651, "grad_norm": 0.1748654842376709, "learning_rate": 4.631553011767363e-05, "loss": 0.35, "num_input_tokens_seen": 1889640, "step": 9910 }, { "epoch": 5.153326403326403, "grad_norm": 0.8821136951446533, "learning_rate": 4.63096028601812e-05, "loss": 0.1281, "num_input_tokens_seen": 1890568, "step": 9915 }, { "epoch": 5.155925155925156, "grad_norm": 0.4256429672241211, "learning_rate": 4.630367121879566e-05, "loss": 0.1429, "num_input_tokens_seen": 1891560, "step": 9920 }, { "epoch": 5.158523908523908, "grad_norm": 0.15075083076953888, "learning_rate": 4.629773519473728e-05, "loss": 0.088, "num_input_tokens_seen": 1892488, "step": 9925 }, { "epoch": 5.161122661122661, "grad_norm": 0.2739560604095459, "learning_rate": 4.629179478922725e-05, "loss": 0.0934, "num_input_tokens_seen": 1893384, "step": 9930 }, { "epoch": 5.163721413721413, "grad_norm": 0.4994238018989563, "learning_rate": 4.628585000348765e-05, "loss": 0.1284, "num_input_tokens_seen": 1894312, "step": 9935 }, { "epoch": 5.166320166320166, "grad_norm": 0.29331663250923157, "learning_rate": 4.627990083874146e-05, "loss": 0.0541, "num_input_tokens_seen": 1895208, "step": 9940 }, { "epoch": 5.168918918918919, "grad_norm": 1.0414018630981445, "learning_rate": 4.6273947296212564e-05, "loss": 0.1394, "num_input_tokens_seen": 1896104, "step": 9945 }, { "epoch": 5.171517671517671, "grad_norm": 0.5229169726371765, "learning_rate": 4.626798937712576e-05, "loss": 0.0928, "num_input_tokens_seen": 1897000, "step": 9950 }, { "epoch": 5.174116424116424, "grad_norm": 0.47315022349357605, "learning_rate": 4.626202708270672e-05, "loss": 0.12, "num_input_tokens_seen": 1897992, "step": 9955 }, { "epoch": 5.1767151767151764, "grad_norm": 0.2798098921775818, "learning_rate": 4.625606041418203e-05, "loss": 0.1074, "num_input_tokens_seen": 1898856, "step": 9960 }, { "epoch": 5.179313929313929, "grad_norm": 1.0898576974868774, "learning_rate": 4.6250089372779196e-05, "loss": 0.1166, "num_input_tokens_seen": 1899784, "step": 9965 }, { "epoch": 5.1819126819126815, "grad_norm": 0.4608388841152191, "learning_rate": 4.624411395972659e-05, "loss": 0.0491, "num_input_tokens_seen": 1900712, "step": 9970 }, { "epoch": 5.1845114345114345, "grad_norm": 0.7805703282356262, "learning_rate": 4.623813417625348e-05, "loss": 0.1232, "num_input_tokens_seen": 1901672, "step": 9975 }, { "epoch": 5.1871101871101875, "grad_norm": 0.24038617312908173, "learning_rate": 4.6232150023590075e-05, "loss": 0.0928, "num_input_tokens_seen": 1902600, "step": 9980 }, { "epoch": 5.1897089397089395, "grad_norm": 0.4669181704521179, "learning_rate": 4.622616150296745e-05, "loss": 0.0986, "num_input_tokens_seen": 1903560, "step": 9985 }, { "epoch": 5.1923076923076925, "grad_norm": 0.8333826661109924, "learning_rate": 4.6220168615617584e-05, "loss": 0.1765, "num_input_tokens_seen": 1904456, "step": 9990 }, { "epoch": 5.194906444906445, "grad_norm": 1.4139373302459717, "learning_rate": 4.621417136277335e-05, "loss": 0.0869, "num_input_tokens_seen": 1905416, "step": 9995 }, { "epoch": 5.197505197505198, "grad_norm": 0.6238323450088501, "learning_rate": 4.6208169745668546e-05, "loss": 0.0848, "num_input_tokens_seen": 1906344, "step": 10000 }, { "epoch": 5.20010395010395, "grad_norm": 1.4953361749649048, "learning_rate": 4.620216376553783e-05, "loss": 0.2543, "num_input_tokens_seen": 1907304, "step": 10005 }, { "epoch": 5.202702702702703, "grad_norm": 0.10702339559793472, "learning_rate": 4.619615342361678e-05, "loss": 0.1177, "num_input_tokens_seen": 1908328, "step": 10010 }, { "epoch": 5.205301455301456, "grad_norm": 0.71174156665802, "learning_rate": 4.619013872114186e-05, "loss": 0.2281, "num_input_tokens_seen": 1909288, "step": 10015 }, { "epoch": 5.207900207900208, "grad_norm": 0.3504315912723541, "learning_rate": 4.618411965935045e-05, "loss": 0.1128, "num_input_tokens_seen": 1910216, "step": 10020 }, { "epoch": 5.210498960498961, "grad_norm": 0.7544816732406616, "learning_rate": 4.617809623948082e-05, "loss": 0.1638, "num_input_tokens_seen": 1911144, "step": 10025 }, { "epoch": 5.213097713097713, "grad_norm": 0.4364742338657379, "learning_rate": 4.617206846277211e-05, "loss": 0.115, "num_input_tokens_seen": 1912136, "step": 10030 }, { "epoch": 5.215696465696466, "grad_norm": 1.1362169981002808, "learning_rate": 4.6166036330464385e-05, "loss": 0.083, "num_input_tokens_seen": 1913032, "step": 10035 }, { "epoch": 5.218295218295219, "grad_norm": 1.1798741817474365, "learning_rate": 4.6159999843798605e-05, "loss": 0.2102, "num_input_tokens_seen": 1913928, "step": 10040 }, { "epoch": 5.220893970893971, "grad_norm": 0.5599799156188965, "learning_rate": 4.615395900401661e-05, "loss": 0.1771, "num_input_tokens_seen": 1914920, "step": 10045 }, { "epoch": 5.223492723492724, "grad_norm": 0.7319332957267761, "learning_rate": 4.614791381236115e-05, "loss": 0.2164, "num_input_tokens_seen": 1915944, "step": 10050 }, { "epoch": 5.226091476091476, "grad_norm": 0.5378111600875854, "learning_rate": 4.6141864270075873e-05, "loss": 0.2213, "num_input_tokens_seen": 1916872, "step": 10055 }, { "epoch": 5.228690228690229, "grad_norm": 1.0282436609268188, "learning_rate": 4.61358103784053e-05, "loss": 0.1303, "num_input_tokens_seen": 1917768, "step": 10060 }, { "epoch": 5.231288981288981, "grad_norm": 0.8773472309112549, "learning_rate": 4.6129752138594874e-05, "loss": 0.1036, "num_input_tokens_seen": 1918632, "step": 10065 }, { "epoch": 5.233887733887734, "grad_norm": 1.2477905750274658, "learning_rate": 4.61236895518909e-05, "loss": 0.1977, "num_input_tokens_seen": 1919560, "step": 10070 }, { "epoch": 5.236486486486487, "grad_norm": 0.8985896110534668, "learning_rate": 4.611762261954062e-05, "loss": 0.2167, "num_input_tokens_seen": 1920520, "step": 10075 }, { "epoch": 5.239085239085239, "grad_norm": 0.481510192155838, "learning_rate": 4.6111551342792126e-05, "loss": 0.0988, "num_input_tokens_seen": 1921576, "step": 10080 }, { "epoch": 5.241683991683992, "grad_norm": 0.46717798709869385, "learning_rate": 4.6105475722894434e-05, "loss": 0.0802, "num_input_tokens_seen": 1922472, "step": 10085 }, { "epoch": 5.244282744282744, "grad_norm": 1.1250535249710083, "learning_rate": 4.6099395761097444e-05, "loss": 0.2878, "num_input_tokens_seen": 1923432, "step": 10090 }, { "epoch": 5.246881496881497, "grad_norm": 1.0821844339370728, "learning_rate": 4.609331145865195e-05, "loss": 0.1925, "num_input_tokens_seen": 1924392, "step": 10095 }, { "epoch": 5.24948024948025, "grad_norm": 0.6799405217170715, "learning_rate": 4.6087222816809634e-05, "loss": 0.1593, "num_input_tokens_seen": 1925416, "step": 10100 }, { "epoch": 5.252079002079002, "grad_norm": 0.6813819408416748, "learning_rate": 4.608112983682308e-05, "loss": 0.2169, "num_input_tokens_seen": 1926376, "step": 10105 }, { "epoch": 5.254677754677755, "grad_norm": 0.4533242881298065, "learning_rate": 4.607503251994575e-05, "loss": 0.1275, "num_input_tokens_seen": 1927272, "step": 10110 }, { "epoch": 5.257276507276507, "grad_norm": 0.24367183446884155, "learning_rate": 4.606893086743201e-05, "loss": 0.118, "num_input_tokens_seen": 1928200, "step": 10115 }, { "epoch": 5.25987525987526, "grad_norm": 0.11410727351903915, "learning_rate": 4.606282488053713e-05, "loss": 0.1733, "num_input_tokens_seen": 1929160, "step": 10120 }, { "epoch": 5.262474012474012, "grad_norm": 1.0988218784332275, "learning_rate": 4.605671456051724e-05, "loss": 0.1681, "num_input_tokens_seen": 1930120, "step": 10125 }, { "epoch": 5.265072765072765, "grad_norm": 0.5290773510932922, "learning_rate": 4.6050599908629386e-05, "loss": 0.1862, "num_input_tokens_seen": 1931144, "step": 10130 }, { "epoch": 5.267671517671518, "grad_norm": 0.7478612661361694, "learning_rate": 4.6044480926131494e-05, "loss": 0.2188, "num_input_tokens_seen": 1932072, "step": 10135 }, { "epoch": 5.27027027027027, "grad_norm": 0.34276553988456726, "learning_rate": 4.6038357614282387e-05, "loss": 0.3228, "num_input_tokens_seen": 1933064, "step": 10140 }, { "epoch": 5.272869022869023, "grad_norm": 0.5331087708473206, "learning_rate": 4.603222997434177e-05, "loss": 0.0791, "num_input_tokens_seen": 1933928, "step": 10145 }, { "epoch": 5.275467775467775, "grad_norm": 0.32532215118408203, "learning_rate": 4.6026098007570256e-05, "loss": 0.0881, "num_input_tokens_seen": 1934888, "step": 10150 }, { "epoch": 5.278066528066528, "grad_norm": 0.562980592250824, "learning_rate": 4.601996171522933e-05, "loss": 0.152, "num_input_tokens_seen": 1935784, "step": 10155 }, { "epoch": 5.28066528066528, "grad_norm": 0.7492615580558777, "learning_rate": 4.6013821098581375e-05, "loss": 0.1555, "num_input_tokens_seen": 1936776, "step": 10160 }, { "epoch": 5.283264033264033, "grad_norm": 0.6586666703224182, "learning_rate": 4.6007676158889657e-05, "loss": 0.1488, "num_input_tokens_seen": 1937768, "step": 10165 }, { "epoch": 5.285862785862786, "grad_norm": 0.8315315246582031, "learning_rate": 4.6001526897418344e-05, "loss": 0.1798, "num_input_tokens_seen": 1938696, "step": 10170 }, { "epoch": 5.288461538461538, "grad_norm": 1.871394157409668, "learning_rate": 4.599537331543248e-05, "loss": 0.2246, "num_input_tokens_seen": 1939624, "step": 10175 }, { "epoch": 5.291060291060291, "grad_norm": 0.15228499472141266, "learning_rate": 4.5989215414198004e-05, "loss": 0.1336, "num_input_tokens_seen": 1940648, "step": 10180 }, { "epoch": 5.293659043659043, "grad_norm": 0.571860134601593, "learning_rate": 4.598305319498175e-05, "loss": 0.1359, "num_input_tokens_seen": 1941704, "step": 10185 }, { "epoch": 5.296257796257796, "grad_norm": 0.31731581687927246, "learning_rate": 4.597688665905143e-05, "loss": 0.2474, "num_input_tokens_seen": 1942696, "step": 10190 }, { "epoch": 5.298856548856548, "grad_norm": 0.15100562572479248, "learning_rate": 4.597071580767565e-05, "loss": 0.0852, "num_input_tokens_seen": 1943688, "step": 10195 }, { "epoch": 5.301455301455301, "grad_norm": 0.39250901341438293, "learning_rate": 4.596454064212388e-05, "loss": 0.2546, "num_input_tokens_seen": 1944584, "step": 10200 }, { "epoch": 5.304054054054054, "grad_norm": 2.0276567935943604, "learning_rate": 4.595836116366653e-05, "loss": 0.1069, "num_input_tokens_seen": 1945576, "step": 10205 }, { "epoch": 5.3066528066528065, "grad_norm": 0.9165298938751221, "learning_rate": 4.595217737357484e-05, "loss": 0.1158, "num_input_tokens_seen": 1946632, "step": 10210 }, { "epoch": 5.3092515592515594, "grad_norm": 0.2245965152978897, "learning_rate": 4.594598927312098e-05, "loss": 0.1064, "num_input_tokens_seen": 1947560, "step": 10215 }, { "epoch": 5.3118503118503115, "grad_norm": 0.5703166127204895, "learning_rate": 4.593979686357799e-05, "loss": 0.1412, "num_input_tokens_seen": 1948520, "step": 10220 }, { "epoch": 5.3144490644490645, "grad_norm": 0.1679840087890625, "learning_rate": 4.593360014621978e-05, "loss": 0.11, "num_input_tokens_seen": 1949448, "step": 10225 }, { "epoch": 5.317047817047817, "grad_norm": 0.8346087336540222, "learning_rate": 4.592739912232118e-05, "loss": 0.2623, "num_input_tokens_seen": 1950376, "step": 10230 }, { "epoch": 5.31964656964657, "grad_norm": 0.5165576338768005, "learning_rate": 4.592119379315788e-05, "loss": 0.0754, "num_input_tokens_seen": 1951272, "step": 10235 }, { "epoch": 5.3222453222453225, "grad_norm": 0.4681049883365631, "learning_rate": 4.591498416000647e-05, "loss": 0.0893, "num_input_tokens_seen": 1952232, "step": 10240 }, { "epoch": 5.324844074844075, "grad_norm": 0.6667765378952026, "learning_rate": 4.59087702241444e-05, "loss": 0.1599, "num_input_tokens_seen": 1953224, "step": 10245 }, { "epoch": 5.327442827442828, "grad_norm": 0.7833064198493958, "learning_rate": 4.5902551986850056e-05, "loss": 0.134, "num_input_tokens_seen": 1954152, "step": 10250 }, { "epoch": 5.33004158004158, "grad_norm": 0.4975820481777191, "learning_rate": 4.589632944940265e-05, "loss": 0.1069, "num_input_tokens_seen": 1955080, "step": 10255 }, { "epoch": 5.332640332640333, "grad_norm": 0.9518198370933533, "learning_rate": 4.589010261308233e-05, "loss": 0.1959, "num_input_tokens_seen": 1955976, "step": 10260 }, { "epoch": 5.335239085239086, "grad_norm": 0.7918531894683838, "learning_rate": 4.588387147917008e-05, "loss": 0.0986, "num_input_tokens_seen": 1956904, "step": 10265 }, { "epoch": 5.337837837837838, "grad_norm": 0.671206533908844, "learning_rate": 4.5877636048947806e-05, "loss": 0.1716, "num_input_tokens_seen": 1957864, "step": 10270 }, { "epoch": 5.340436590436591, "grad_norm": 0.6062809824943542, "learning_rate": 4.587139632369829e-05, "loss": 0.1238, "num_input_tokens_seen": 1958824, "step": 10275 }, { "epoch": 5.343035343035343, "grad_norm": 0.77029949426651, "learning_rate": 4.586515230470518e-05, "loss": 0.1651, "num_input_tokens_seen": 1959816, "step": 10280 }, { "epoch": 5.345634095634096, "grad_norm": 1.579572319984436, "learning_rate": 4.585890399325302e-05, "loss": 0.2265, "num_input_tokens_seen": 1960776, "step": 10285 }, { "epoch": 5.348232848232848, "grad_norm": 0.3388689458370209, "learning_rate": 4.5852651390627257e-05, "loss": 0.14, "num_input_tokens_seen": 1961672, "step": 10290 }, { "epoch": 5.350831600831601, "grad_norm": 0.26376479864120483, "learning_rate": 4.584639449811417e-05, "loss": 0.1039, "num_input_tokens_seen": 1962600, "step": 10295 }, { "epoch": 5.353430353430354, "grad_norm": 0.5619572401046753, "learning_rate": 4.5840133317000975e-05, "loss": 0.1108, "num_input_tokens_seen": 1963496, "step": 10300 }, { "epoch": 5.356029106029106, "grad_norm": 1.192718505859375, "learning_rate": 4.583386784857573e-05, "loss": 0.231, "num_input_tokens_seen": 1964424, "step": 10305 }, { "epoch": 5.358627858627859, "grad_norm": 0.5401699542999268, "learning_rate": 4.58275980941274e-05, "loss": 0.1506, "num_input_tokens_seen": 1965544, "step": 10310 }, { "epoch": 5.361226611226611, "grad_norm": 0.7449181079864502, "learning_rate": 4.582132405494582e-05, "loss": 0.2064, "num_input_tokens_seen": 1966504, "step": 10315 }, { "epoch": 5.363825363825364, "grad_norm": 0.7060927748680115, "learning_rate": 4.581504573232171e-05, "loss": 0.1163, "num_input_tokens_seen": 1967528, "step": 10320 }, { "epoch": 5.366424116424117, "grad_norm": 0.39337578415870667, "learning_rate": 4.580876312754668e-05, "loss": 0.0945, "num_input_tokens_seen": 1968552, "step": 10325 }, { "epoch": 5.369022869022869, "grad_norm": 0.45838677883148193, "learning_rate": 4.5802476241913194e-05, "loss": 0.247, "num_input_tokens_seen": 1969480, "step": 10330 }, { "epoch": 5.371621621621622, "grad_norm": 0.9879007935523987, "learning_rate": 4.579618507671463e-05, "loss": 0.2846, "num_input_tokens_seen": 1970408, "step": 10335 }, { "epoch": 5.374220374220374, "grad_norm": 0.8898592591285706, "learning_rate": 4.5789889633245216e-05, "loss": 0.1505, "num_input_tokens_seen": 1971304, "step": 10340 }, { "epoch": 5.376819126819127, "grad_norm": 0.23695331811904907, "learning_rate": 4.5783589912800085e-05, "loss": 0.1283, "num_input_tokens_seen": 1972232, "step": 10345 }, { "epoch": 5.379417879417879, "grad_norm": 0.1547158807516098, "learning_rate": 4.577728591667523e-05, "loss": 0.2763, "num_input_tokens_seen": 1973192, "step": 10350 }, { "epoch": 5.382016632016632, "grad_norm": 1.605004906654358, "learning_rate": 4.5770977646167544e-05, "loss": 0.263, "num_input_tokens_seen": 1974184, "step": 10355 }, { "epoch": 5.384615384615385, "grad_norm": 0.6783281564712524, "learning_rate": 4.5764665102574785e-05, "loss": 0.1607, "num_input_tokens_seen": 1975144, "step": 10360 }, { "epoch": 5.387214137214137, "grad_norm": 0.4417858123779297, "learning_rate": 4.57583482871956e-05, "loss": 0.0563, "num_input_tokens_seen": 1976072, "step": 10365 }, { "epoch": 5.38981288981289, "grad_norm": 1.1285104751586914, "learning_rate": 4.575202720132949e-05, "loss": 0.2365, "num_input_tokens_seen": 1977032, "step": 10370 }, { "epoch": 5.392411642411642, "grad_norm": 1.0765622854232788, "learning_rate": 4.574570184627688e-05, "loss": 0.1629, "num_input_tokens_seen": 1977992, "step": 10375 }, { "epoch": 5.395010395010395, "grad_norm": 0.40945667028427124, "learning_rate": 4.5739372223339015e-05, "loss": 0.1281, "num_input_tokens_seen": 1978920, "step": 10380 }, { "epoch": 5.397609147609147, "grad_norm": 0.9276033639907837, "learning_rate": 4.573303833381807e-05, "loss": 0.2292, "num_input_tokens_seen": 1979976, "step": 10385 }, { "epoch": 5.4002079002079, "grad_norm": 0.34661513566970825, "learning_rate": 4.5726700179017077e-05, "loss": 0.1113, "num_input_tokens_seen": 1980936, "step": 10390 }, { "epoch": 5.402806652806653, "grad_norm": 0.2229653000831604, "learning_rate": 4.5720357760239927e-05, "loss": 0.1669, "num_input_tokens_seen": 1981832, "step": 10395 }, { "epoch": 5.405405405405405, "grad_norm": 0.6089046597480774, "learning_rate": 4.571401107879144e-05, "loss": 0.1173, "num_input_tokens_seen": 1982760, "step": 10400 }, { "epoch": 5.408004158004158, "grad_norm": 0.34704291820526123, "learning_rate": 4.570766013597725e-05, "loss": 0.1127, "num_input_tokens_seen": 1983784, "step": 10405 }, { "epoch": 5.41060291060291, "grad_norm": 0.2572495937347412, "learning_rate": 4.57013049331039e-05, "loss": 0.1942, "num_input_tokens_seen": 1984712, "step": 10410 }, { "epoch": 5.413201663201663, "grad_norm": 0.6194714903831482, "learning_rate": 4.5694945471478825e-05, "loss": 0.1837, "num_input_tokens_seen": 1985704, "step": 10415 }, { "epoch": 5.415800415800415, "grad_norm": 0.8276395201683044, "learning_rate": 4.56885817524103e-05, "loss": 0.1449, "num_input_tokens_seen": 1986760, "step": 10420 }, { "epoch": 5.418399168399168, "grad_norm": 0.26593390107154846, "learning_rate": 4.56822137772075e-05, "loss": 0.075, "num_input_tokens_seen": 1987752, "step": 10425 }, { "epoch": 5.420997920997921, "grad_norm": 0.6630716323852539, "learning_rate": 4.567584154718048e-05, "loss": 0.1318, "num_input_tokens_seen": 1988744, "step": 10430 }, { "epoch": 5.423596673596673, "grad_norm": 0.5307022929191589, "learning_rate": 4.566946506364013e-05, "loss": 0.121, "num_input_tokens_seen": 1989672, "step": 10435 }, { "epoch": 5.426195426195426, "grad_norm": 1.3678044080734253, "learning_rate": 4.5663084327898275e-05, "loss": 0.1296, "num_input_tokens_seen": 1990664, "step": 10440 }, { "epoch": 5.4287941787941785, "grad_norm": 1.317255973815918, "learning_rate": 4.565669934126757e-05, "loss": 0.1663, "num_input_tokens_seen": 1991624, "step": 10445 }, { "epoch": 5.4313929313929314, "grad_norm": 1.013135552406311, "learning_rate": 4.565031010506157e-05, "loss": 0.3354, "num_input_tokens_seen": 1992616, "step": 10450 }, { "epoch": 5.4339916839916835, "grad_norm": 1.1717324256896973, "learning_rate": 4.5643916620594676e-05, "loss": 0.1718, "num_input_tokens_seen": 1993576, "step": 10455 }, { "epoch": 5.4365904365904365, "grad_norm": 0.2885681688785553, "learning_rate": 4.5637518889182195e-05, "loss": 0.1252, "num_input_tokens_seen": 1994504, "step": 10460 }, { "epoch": 5.4391891891891895, "grad_norm": 0.7955191731452942, "learning_rate": 4.563111691214028e-05, "loss": 0.0971, "num_input_tokens_seen": 1995496, "step": 10465 }, { "epoch": 5.441787941787942, "grad_norm": 0.40117111802101135, "learning_rate": 4.562471069078599e-05, "loss": 0.1275, "num_input_tokens_seen": 1996520, "step": 10470 }, { "epoch": 5.4443866943866945, "grad_norm": 0.4847790002822876, "learning_rate": 4.561830022643721e-05, "loss": 0.0658, "num_input_tokens_seen": 1997480, "step": 10475 }, { "epoch": 5.446985446985447, "grad_norm": 0.4030018150806427, "learning_rate": 4.561188552041274e-05, "loss": 0.2051, "num_input_tokens_seen": 1998440, "step": 10480 }, { "epoch": 5.4495841995842, "grad_norm": 0.18161050975322723, "learning_rate": 4.560546657403225e-05, "loss": 0.0667, "num_input_tokens_seen": 1999368, "step": 10485 }, { "epoch": 5.452182952182953, "grad_norm": 1.0294221639633179, "learning_rate": 4.5599043388616256e-05, "loss": 0.1899, "num_input_tokens_seen": 2000328, "step": 10490 }, { "epoch": 5.454781704781705, "grad_norm": 1.1828713417053223, "learning_rate": 4.559261596548616e-05, "loss": 0.1444, "num_input_tokens_seen": 2001352, "step": 10495 }, { "epoch": 5.457380457380458, "grad_norm": 0.13042408227920532, "learning_rate": 4.558618430596424e-05, "loss": 0.0929, "num_input_tokens_seen": 2002248, "step": 10500 }, { "epoch": 5.45997920997921, "grad_norm": 0.14248886704444885, "learning_rate": 4.557974841137364e-05, "loss": 0.1145, "num_input_tokens_seen": 2003176, "step": 10505 }, { "epoch": 5.462577962577963, "grad_norm": 0.5564173460006714, "learning_rate": 4.557330828303837e-05, "loss": 0.1684, "num_input_tokens_seen": 2004168, "step": 10510 }, { "epoch": 5.465176715176715, "grad_norm": 0.2599652409553528, "learning_rate": 4.556686392228333e-05, "loss": 0.1649, "num_input_tokens_seen": 2005160, "step": 10515 }, { "epoch": 5.467775467775468, "grad_norm": 0.9242967963218689, "learning_rate": 4.556041533043427e-05, "loss": 0.0741, "num_input_tokens_seen": 2006024, "step": 10520 }, { "epoch": 5.470374220374221, "grad_norm": 0.394460529088974, "learning_rate": 4.555396250881783e-05, "loss": 0.1781, "num_input_tokens_seen": 2006920, "step": 10525 }, { "epoch": 5.472972972972973, "grad_norm": 0.5293152928352356, "learning_rate": 4.554750545876149e-05, "loss": 0.1836, "num_input_tokens_seen": 2007848, "step": 10530 }, { "epoch": 5.475571725571726, "grad_norm": 0.45080286264419556, "learning_rate": 4.554104418159364e-05, "loss": 0.1543, "num_input_tokens_seen": 2008840, "step": 10535 }, { "epoch": 5.478170478170478, "grad_norm": 0.8002799153327942, "learning_rate": 4.5534578678643505e-05, "loss": 0.134, "num_input_tokens_seen": 2009768, "step": 10540 }, { "epoch": 5.480769230769231, "grad_norm": 0.7330312132835388, "learning_rate": 4.55281089512412e-05, "loss": 0.1983, "num_input_tokens_seen": 2010824, "step": 10545 }, { "epoch": 5.483367983367984, "grad_norm": 0.1940452605485916, "learning_rate": 4.55216350007177e-05, "loss": 0.0905, "num_input_tokens_seen": 2011816, "step": 10550 }, { "epoch": 5.485966735966736, "grad_norm": 0.3712399899959564, "learning_rate": 4.551515682840485e-05, "loss": 0.1548, "num_input_tokens_seen": 2012808, "step": 10555 }, { "epoch": 5.488565488565489, "grad_norm": 0.36090967059135437, "learning_rate": 4.550867443563536e-05, "loss": 0.0692, "num_input_tokens_seen": 2013704, "step": 10560 }, { "epoch": 5.491164241164241, "grad_norm": 0.5495150089263916, "learning_rate": 4.5502187823742825e-05, "loss": 0.1347, "num_input_tokens_seen": 2014664, "step": 10565 }, { "epoch": 5.493762993762994, "grad_norm": 0.35961589217185974, "learning_rate": 4.5495696994061686e-05, "loss": 0.1445, "num_input_tokens_seen": 2015656, "step": 10570 }, { "epoch": 5.496361746361746, "grad_norm": 0.9156236052513123, "learning_rate": 4.5489201947927264e-05, "loss": 0.1263, "num_input_tokens_seen": 2016584, "step": 10575 }, { "epoch": 5.498960498960499, "grad_norm": 0.24568447470664978, "learning_rate": 4.5482702686675735e-05, "loss": 0.1283, "num_input_tokens_seen": 2017544, "step": 10580 }, { "epoch": 5.501559251559252, "grad_norm": 0.772939920425415, "learning_rate": 4.547619921164418e-05, "loss": 0.0564, "num_input_tokens_seen": 2018536, "step": 10585 }, { "epoch": 5.504158004158004, "grad_norm": 0.4226590692996979, "learning_rate": 4.546969152417048e-05, "loss": 0.1509, "num_input_tokens_seen": 2019432, "step": 10590 }, { "epoch": 5.506756756756757, "grad_norm": 0.7776605486869812, "learning_rate": 4.5463179625593454e-05, "loss": 0.2061, "num_input_tokens_seen": 2020360, "step": 10595 }, { "epoch": 5.509355509355509, "grad_norm": 0.4995742738246918, "learning_rate": 4.545666351725273e-05, "loss": 0.115, "num_input_tokens_seen": 2021288, "step": 10600 }, { "epoch": 5.511954261954262, "grad_norm": 0.49508965015411377, "learning_rate": 4.545014320048885e-05, "loss": 0.1128, "num_input_tokens_seen": 2022216, "step": 10605 }, { "epoch": 5.514553014553014, "grad_norm": 0.897853672504425, "learning_rate": 4.5443618676643185e-05, "loss": 0.117, "num_input_tokens_seen": 2023272, "step": 10610 }, { "epoch": 5.517151767151767, "grad_norm": 0.754244327545166, "learning_rate": 4.5437089947057986e-05, "loss": 0.1953, "num_input_tokens_seen": 2024200, "step": 10615 }, { "epoch": 5.51975051975052, "grad_norm": 0.061154209077358246, "learning_rate": 4.543055701307637e-05, "loss": 0.0641, "num_input_tokens_seen": 2025128, "step": 10620 }, { "epoch": 5.522349272349272, "grad_norm": 0.6198918223381042, "learning_rate": 4.542401987604232e-05, "loss": 0.1513, "num_input_tokens_seen": 2026088, "step": 10625 }, { "epoch": 5.524948024948025, "grad_norm": 0.595166802406311, "learning_rate": 4.5417478537300676e-05, "loss": 0.1954, "num_input_tokens_seen": 2027080, "step": 10630 }, { "epoch": 5.527546777546777, "grad_norm": 0.9947251081466675, "learning_rate": 4.541093299819714e-05, "loss": 0.1431, "num_input_tokens_seen": 2028168, "step": 10635 }, { "epoch": 5.53014553014553, "grad_norm": 0.4166143238544464, "learning_rate": 4.540438326007831e-05, "loss": 0.1204, "num_input_tokens_seen": 2029128, "step": 10640 }, { "epoch": 5.532744282744282, "grad_norm": 0.39495834708213806, "learning_rate": 4.53978293242916e-05, "loss": 0.151, "num_input_tokens_seen": 2030056, "step": 10645 }, { "epoch": 5.535343035343035, "grad_norm": 1.6057237386703491, "learning_rate": 4.539127119218531e-05, "loss": 0.2254, "num_input_tokens_seen": 2030952, "step": 10650 }, { "epoch": 5.537941787941788, "grad_norm": 0.4947931170463562, "learning_rate": 4.5384708865108625e-05, "loss": 0.0405, "num_input_tokens_seen": 2031816, "step": 10655 }, { "epoch": 5.54054054054054, "grad_norm": 0.22843319177627563, "learning_rate": 4.537814234441156e-05, "loss": 0.1176, "num_input_tokens_seen": 2032712, "step": 10660 }, { "epoch": 5.543139293139293, "grad_norm": 1.7532373666763306, "learning_rate": 4.5371571631444995e-05, "loss": 0.1753, "num_input_tokens_seen": 2033640, "step": 10665 }, { "epoch": 5.545738045738045, "grad_norm": 0.611322820186615, "learning_rate": 4.536499672756069e-05, "loss": 0.1669, "num_input_tokens_seen": 2034632, "step": 10670 }, { "epoch": 5.548336798336798, "grad_norm": 0.4182029366493225, "learning_rate": 4.535841763411127e-05, "loss": 0.1321, "num_input_tokens_seen": 2035592, "step": 10675 }, { "epoch": 5.5509355509355505, "grad_norm": 0.3540584444999695, "learning_rate": 4.53518343524502e-05, "loss": 0.2167, "num_input_tokens_seen": 2036680, "step": 10680 }, { "epoch": 5.553534303534303, "grad_norm": 1.2798994779586792, "learning_rate": 4.5345246883931817e-05, "loss": 0.2026, "num_input_tokens_seen": 2037672, "step": 10685 }, { "epoch": 5.556133056133056, "grad_norm": 0.08250773698091507, "learning_rate": 4.533865522991132e-05, "loss": 0.1275, "num_input_tokens_seen": 2038600, "step": 10690 }, { "epoch": 5.5587318087318085, "grad_norm": 0.6623890995979309, "learning_rate": 4.533205939174478e-05, "loss": 0.1007, "num_input_tokens_seen": 2039528, "step": 10695 }, { "epoch": 5.5613305613305615, "grad_norm": 0.13558761775493622, "learning_rate": 4.532545937078911e-05, "loss": 0.0509, "num_input_tokens_seen": 2040520, "step": 10700 }, { "epoch": 5.563929313929314, "grad_norm": 0.8250835537910461, "learning_rate": 4.531885516840209e-05, "loss": 0.1619, "num_input_tokens_seen": 2041512, "step": 10705 }, { "epoch": 5.5665280665280665, "grad_norm": 0.44240808486938477, "learning_rate": 4.531224678594237e-05, "loss": 0.0594, "num_input_tokens_seen": 2042472, "step": 10710 }, { "epoch": 5.5691268191268195, "grad_norm": 0.6214687824249268, "learning_rate": 4.530563422476944e-05, "loss": 0.2959, "num_input_tokens_seen": 2043368, "step": 10715 }, { "epoch": 5.571725571725572, "grad_norm": 0.36888253688812256, "learning_rate": 4.529901748624367e-05, "loss": 0.1335, "num_input_tokens_seen": 2044424, "step": 10720 }, { "epoch": 5.574324324324325, "grad_norm": 0.2061328887939453, "learning_rate": 4.529239657172627e-05, "loss": 0.1265, "num_input_tokens_seen": 2045384, "step": 10725 }, { "epoch": 5.576923076923077, "grad_norm": 0.09766481071710587, "learning_rate": 4.5285771482579336e-05, "loss": 0.0903, "num_input_tokens_seen": 2046344, "step": 10730 }, { "epoch": 5.57952182952183, "grad_norm": 0.599465548992157, "learning_rate": 4.527914222016579e-05, "loss": 0.0754, "num_input_tokens_seen": 2047368, "step": 10735 }, { "epoch": 5.582120582120583, "grad_norm": 0.7888445258140564, "learning_rate": 4.527250878584945e-05, "loss": 0.1278, "num_input_tokens_seen": 2048296, "step": 10740 }, { "epoch": 5.584719334719335, "grad_norm": 0.09671462327241898, "learning_rate": 4.526587118099496e-05, "loss": 0.1067, "num_input_tokens_seen": 2049288, "step": 10745 }, { "epoch": 5.587318087318088, "grad_norm": 1.0159300565719604, "learning_rate": 4.525922940696782e-05, "loss": 0.2416, "num_input_tokens_seen": 2050248, "step": 10750 }, { "epoch": 5.58991683991684, "grad_norm": 0.8033612370491028, "learning_rate": 4.525258346513442e-05, "loss": 0.12, "num_input_tokens_seen": 2051176, "step": 10755 }, { "epoch": 5.592515592515593, "grad_norm": 0.22575844824314117, "learning_rate": 4.524593335686199e-05, "loss": 0.1163, "num_input_tokens_seen": 2052168, "step": 10760 }, { "epoch": 5.595114345114345, "grad_norm": 0.3183082938194275, "learning_rate": 4.52392790835186e-05, "loss": 0.0936, "num_input_tokens_seen": 2053096, "step": 10765 }, { "epoch": 5.597713097713098, "grad_norm": 2.057158946990967, "learning_rate": 4.5232620646473204e-05, "loss": 0.1584, "num_input_tokens_seen": 2054024, "step": 10770 }, { "epoch": 5.600311850311851, "grad_norm": 0.09515754133462906, "learning_rate": 4.5225958047095596e-05, "loss": 0.1726, "num_input_tokens_seen": 2054984, "step": 10775 }, { "epoch": 5.602910602910603, "grad_norm": 0.18517684936523438, "learning_rate": 4.5219291286756436e-05, "loss": 0.0907, "num_input_tokens_seen": 2055880, "step": 10780 }, { "epoch": 5.605509355509356, "grad_norm": 0.2326105237007141, "learning_rate": 4.521262036682724e-05, "loss": 0.0996, "num_input_tokens_seen": 2056840, "step": 10785 }, { "epoch": 5.608108108108108, "grad_norm": 0.1848461627960205, "learning_rate": 4.520594528868036e-05, "loss": 0.0953, "num_input_tokens_seen": 2057800, "step": 10790 }, { "epoch": 5.610706860706861, "grad_norm": 1.482040286064148, "learning_rate": 4.519926605368903e-05, "loss": 0.1657, "num_input_tokens_seen": 2058856, "step": 10795 }, { "epoch": 5.613305613305613, "grad_norm": 0.5389889478683472, "learning_rate": 4.519258266322732e-05, "loss": 0.1108, "num_input_tokens_seen": 2059848, "step": 10800 }, { "epoch": 5.615904365904366, "grad_norm": 0.24826790392398834, "learning_rate": 4.518589511867017e-05, "loss": 0.0639, "num_input_tokens_seen": 2060776, "step": 10805 }, { "epoch": 5.618503118503119, "grad_norm": 0.23848110437393188, "learning_rate": 4.5179203421393366e-05, "loss": 0.2318, "num_input_tokens_seen": 2061640, "step": 10810 }, { "epoch": 5.621101871101871, "grad_norm": 0.47672346234321594, "learning_rate": 4.5172507572773546e-05, "loss": 0.1452, "num_input_tokens_seen": 2062600, "step": 10815 }, { "epoch": 5.623700623700624, "grad_norm": 0.31293001770973206, "learning_rate": 4.516580757418821e-05, "loss": 0.1094, "num_input_tokens_seen": 2063560, "step": 10820 }, { "epoch": 5.626299376299376, "grad_norm": 0.1309116929769516, "learning_rate": 4.515910342701571e-05, "loss": 0.2057, "num_input_tokens_seen": 2064488, "step": 10825 }, { "epoch": 5.628898128898129, "grad_norm": 0.5177907943725586, "learning_rate": 4.515239513263524e-05, "loss": 0.078, "num_input_tokens_seen": 2065384, "step": 10830 }, { "epoch": 5.631496881496881, "grad_norm": 0.4763738512992859, "learning_rate": 4.5145682692426864e-05, "loss": 0.1482, "num_input_tokens_seen": 2066376, "step": 10835 }, { "epoch": 5.634095634095634, "grad_norm": 0.6183321475982666, "learning_rate": 4.5138966107771474e-05, "loss": 0.1551, "num_input_tokens_seen": 2067432, "step": 10840 }, { "epoch": 5.636694386694387, "grad_norm": 0.19913199543952942, "learning_rate": 4.513224538005086e-05, "loss": 0.1751, "num_input_tokens_seen": 2068424, "step": 10845 }, { "epoch": 5.639293139293139, "grad_norm": 0.09207814186811447, "learning_rate": 4.512552051064761e-05, "loss": 0.0834, "num_input_tokens_seen": 2069448, "step": 10850 }, { "epoch": 5.641891891891892, "grad_norm": 0.5921396017074585, "learning_rate": 4.5118791500945207e-05, "loss": 0.2147, "num_input_tokens_seen": 2070344, "step": 10855 }, { "epoch": 5.644490644490644, "grad_norm": 0.49717995524406433, "learning_rate": 4.511205835232796e-05, "loss": 0.1234, "num_input_tokens_seen": 2071304, "step": 10860 }, { "epoch": 5.647089397089397, "grad_norm": 1.012526273727417, "learning_rate": 4.5105321066181036e-05, "loss": 0.1816, "num_input_tokens_seen": 2072264, "step": 10865 }, { "epoch": 5.649688149688149, "grad_norm": 0.4740554988384247, "learning_rate": 4.509857964389046e-05, "loss": 0.115, "num_input_tokens_seen": 2073224, "step": 10870 }, { "epoch": 5.652286902286902, "grad_norm": 0.5401517748832703, "learning_rate": 4.509183408684311e-05, "loss": 0.0777, "num_input_tokens_seen": 2074216, "step": 10875 }, { "epoch": 5.654885654885655, "grad_norm": 0.43396100401878357, "learning_rate": 4.50850843964267e-05, "loss": 0.0893, "num_input_tokens_seen": 2075240, "step": 10880 }, { "epoch": 5.657484407484407, "grad_norm": 0.20802396535873413, "learning_rate": 4.5078330574029804e-05, "loss": 0.2435, "num_input_tokens_seen": 2076232, "step": 10885 }, { "epoch": 5.66008316008316, "grad_norm": 0.4917156398296356, "learning_rate": 4.5071572621041834e-05, "loss": 0.1765, "num_input_tokens_seen": 2077224, "step": 10890 }, { "epoch": 5.662681912681912, "grad_norm": 1.1057567596435547, "learning_rate": 4.506481053885308e-05, "loss": 0.1684, "num_input_tokens_seen": 2078120, "step": 10895 }, { "epoch": 5.665280665280665, "grad_norm": 0.41705262660980225, "learning_rate": 4.505804432885464e-05, "loss": 0.0977, "num_input_tokens_seen": 2079016, "step": 10900 }, { "epoch": 5.667879417879417, "grad_norm": 1.2207942008972168, "learning_rate": 4.505127399243851e-05, "loss": 0.133, "num_input_tokens_seen": 2079976, "step": 10905 }, { "epoch": 5.67047817047817, "grad_norm": 0.059464603662490845, "learning_rate": 4.50444995309975e-05, "loss": 0.0555, "num_input_tokens_seen": 2080904, "step": 10910 }, { "epoch": 5.673076923076923, "grad_norm": 0.8383612632751465, "learning_rate": 4.5037720945925276e-05, "loss": 0.2363, "num_input_tokens_seen": 2081832, "step": 10915 }, { "epoch": 5.675675675675675, "grad_norm": 0.29020923376083374, "learning_rate": 4.503093823861635e-05, "loss": 0.1588, "num_input_tokens_seen": 2082824, "step": 10920 }, { "epoch": 5.678274428274428, "grad_norm": 0.3958486318588257, "learning_rate": 4.5024151410466096e-05, "loss": 0.0981, "num_input_tokens_seen": 2083784, "step": 10925 }, { "epoch": 5.6808731808731805, "grad_norm": 0.19240517914295197, "learning_rate": 4.501736046287072e-05, "loss": 0.1075, "num_input_tokens_seen": 2084744, "step": 10930 }, { "epoch": 5.6834719334719335, "grad_norm": 0.8903685212135315, "learning_rate": 4.5010565397227285e-05, "loss": 0.1266, "num_input_tokens_seen": 2085672, "step": 10935 }, { "epoch": 5.686070686070686, "grad_norm": 0.6687089204788208, "learning_rate": 4.5003766214933694e-05, "loss": 0.158, "num_input_tokens_seen": 2086632, "step": 10940 }, { "epoch": 5.6886694386694385, "grad_norm": 1.8062710762023926, "learning_rate": 4.499696291738871e-05, "loss": 0.1965, "num_input_tokens_seen": 2087528, "step": 10945 }, { "epoch": 5.6912681912681915, "grad_norm": 1.8936258554458618, "learning_rate": 4.4990155505991914e-05, "loss": 0.2092, "num_input_tokens_seen": 2088520, "step": 10950 }, { "epoch": 5.693866943866944, "grad_norm": 0.11693195253610611, "learning_rate": 4.498334398214378e-05, "loss": 0.0897, "num_input_tokens_seen": 2089512, "step": 10955 }, { "epoch": 5.696465696465697, "grad_norm": 0.8507768511772156, "learning_rate": 4.497652834724558e-05, "loss": 0.1495, "num_input_tokens_seen": 2090408, "step": 10960 }, { "epoch": 5.6990644490644495, "grad_norm": 2.477823257446289, "learning_rate": 4.4969708602699456e-05, "loss": 0.2684, "num_input_tokens_seen": 2091400, "step": 10965 }, { "epoch": 5.701663201663202, "grad_norm": 0.416723370552063, "learning_rate": 4.4962884749908394e-05, "loss": 0.0822, "num_input_tokens_seen": 2092328, "step": 10970 }, { "epoch": 5.704261954261955, "grad_norm": 0.40513598918914795, "learning_rate": 4.495605679027624e-05, "loss": 0.1407, "num_input_tokens_seen": 2093288, "step": 10975 }, { "epoch": 5.706860706860707, "grad_norm": 0.08927688747644424, "learning_rate": 4.4949224725207626e-05, "loss": 0.1964, "num_input_tokens_seen": 2094312, "step": 10980 }, { "epoch": 5.70945945945946, "grad_norm": 0.893894374370575, "learning_rate": 4.494238855610812e-05, "loss": 0.1278, "num_input_tokens_seen": 2095272, "step": 10985 }, { "epoch": 5.712058212058212, "grad_norm": 1.1912283897399902, "learning_rate": 4.493554828438404e-05, "loss": 0.2416, "num_input_tokens_seen": 2096296, "step": 10990 }, { "epoch": 5.714656964656965, "grad_norm": 0.8037964105606079, "learning_rate": 4.492870391144263e-05, "loss": 0.144, "num_input_tokens_seen": 2097224, "step": 10995 }, { "epoch": 5.717255717255718, "grad_norm": 0.7508125901222229, "learning_rate": 4.492185543869192e-05, "loss": 0.1731, "num_input_tokens_seen": 2098216, "step": 11000 }, { "epoch": 5.71985446985447, "grad_norm": 0.810197114944458, "learning_rate": 4.491500286754081e-05, "loss": 0.1499, "num_input_tokens_seen": 2099112, "step": 11005 }, { "epoch": 5.722453222453223, "grad_norm": 1.0520185232162476, "learning_rate": 4.490814619939904e-05, "loss": 0.1142, "num_input_tokens_seen": 2100040, "step": 11010 }, { "epoch": 5.725051975051975, "grad_norm": 0.538820743560791, "learning_rate": 4.4901285435677186e-05, "loss": 0.106, "num_input_tokens_seen": 2101032, "step": 11015 }, { "epoch": 5.727650727650728, "grad_norm": 0.5491010546684265, "learning_rate": 4.489442057778667e-05, "loss": 0.127, "num_input_tokens_seen": 2102024, "step": 11020 }, { "epoch": 5.73024948024948, "grad_norm": 0.2965851426124573, "learning_rate": 4.488755162713975e-05, "loss": 0.0892, "num_input_tokens_seen": 2102888, "step": 11025 }, { "epoch": 5.732848232848233, "grad_norm": 0.7757210731506348, "learning_rate": 4.488067858514955e-05, "loss": 0.1175, "num_input_tokens_seen": 2103880, "step": 11030 }, { "epoch": 5.735446985446986, "grad_norm": 0.8421044945716858, "learning_rate": 4.487380145323e-05, "loss": 0.1278, "num_input_tokens_seen": 2104872, "step": 11035 }, { "epoch": 5.738045738045738, "grad_norm": 0.5956956744194031, "learning_rate": 4.48669202327959e-05, "loss": 0.2834, "num_input_tokens_seen": 2105832, "step": 11040 }, { "epoch": 5.740644490644491, "grad_norm": 0.9032736420631409, "learning_rate": 4.486003492526289e-05, "loss": 0.0984, "num_input_tokens_seen": 2106792, "step": 11045 }, { "epoch": 5.743243243243243, "grad_norm": 0.35172751545906067, "learning_rate": 4.4853145532047424e-05, "loss": 0.1782, "num_input_tokens_seen": 2107720, "step": 11050 }, { "epoch": 5.745841995841996, "grad_norm": 0.049175046384334564, "learning_rate": 4.484625205456683e-05, "loss": 0.1896, "num_input_tokens_seen": 2108616, "step": 11055 }, { "epoch": 5.748440748440748, "grad_norm": 0.5850111842155457, "learning_rate": 4.4839354494239236e-05, "loss": 0.0683, "num_input_tokens_seen": 2109608, "step": 11060 }, { "epoch": 5.751039501039501, "grad_norm": 1.517465591430664, "learning_rate": 4.483245285248366e-05, "loss": 0.2704, "num_input_tokens_seen": 2110632, "step": 11065 }, { "epoch": 5.753638253638254, "grad_norm": 0.4891008138656616, "learning_rate": 4.482554713071993e-05, "loss": 0.1528, "num_input_tokens_seen": 2111560, "step": 11070 }, { "epoch": 5.756237006237006, "grad_norm": 0.4568317234516144, "learning_rate": 4.481863733036872e-05, "loss": 0.0957, "num_input_tokens_seen": 2112616, "step": 11075 }, { "epoch": 5.758835758835759, "grad_norm": 1.569357991218567, "learning_rate": 4.4811723452851514e-05, "loss": 0.1643, "num_input_tokens_seen": 2113544, "step": 11080 }, { "epoch": 5.761434511434511, "grad_norm": 1.2096600532531738, "learning_rate": 4.4804805499590693e-05, "loss": 0.2333, "num_input_tokens_seen": 2114504, "step": 11085 }, { "epoch": 5.764033264033264, "grad_norm": 0.2275230586528778, "learning_rate": 4.4797883472009435e-05, "loss": 0.0447, "num_input_tokens_seen": 2115400, "step": 11090 }, { "epoch": 5.766632016632016, "grad_norm": 0.12118545174598694, "learning_rate": 4.479095737153176e-05, "loss": 0.2239, "num_input_tokens_seen": 2116296, "step": 11095 }, { "epoch": 5.769230769230769, "grad_norm": 0.9119638800621033, "learning_rate": 4.4784027199582535e-05, "loss": 0.1315, "num_input_tokens_seen": 2117288, "step": 11100 }, { "epoch": 5.771829521829522, "grad_norm": 0.22733943164348602, "learning_rate": 4.4777092957587464e-05, "loss": 0.2182, "num_input_tokens_seen": 2118280, "step": 11105 }, { "epoch": 5.774428274428274, "grad_norm": 0.6402826905250549, "learning_rate": 4.4770154646973084e-05, "loss": 0.2101, "num_input_tokens_seen": 2119272, "step": 11110 }, { "epoch": 5.777027027027027, "grad_norm": 0.33822083473205566, "learning_rate": 4.476321226916677e-05, "loss": 0.178, "num_input_tokens_seen": 2120232, "step": 11115 }, { "epoch": 5.779625779625779, "grad_norm": 1.606250524520874, "learning_rate": 4.4756265825596733e-05, "loss": 0.1851, "num_input_tokens_seen": 2121192, "step": 11120 }, { "epoch": 5.782224532224532, "grad_norm": 0.6452394723892212, "learning_rate": 4.474931531769202e-05, "loss": 0.1615, "num_input_tokens_seen": 2122184, "step": 11125 }, { "epoch": 5.784823284823284, "grad_norm": 0.6032200455665588, "learning_rate": 4.4742360746882534e-05, "loss": 0.101, "num_input_tokens_seen": 2123016, "step": 11130 }, { "epoch": 5.787422037422037, "grad_norm": 0.7877160310745239, "learning_rate": 4.4735402114598974e-05, "loss": 0.1657, "num_input_tokens_seen": 2123976, "step": 11135 }, { "epoch": 5.79002079002079, "grad_norm": 0.7026074528694153, "learning_rate": 4.4728439422272904e-05, "loss": 0.0945, "num_input_tokens_seen": 2124936, "step": 11140 }, { "epoch": 5.792619542619542, "grad_norm": 0.5321882963180542, "learning_rate": 4.4721472671336724e-05, "loss": 0.0756, "num_input_tokens_seen": 2125832, "step": 11145 }, { "epoch": 5.795218295218295, "grad_norm": 0.8426967859268188, "learning_rate": 4.4714501863223646e-05, "loss": 0.164, "num_input_tokens_seen": 2126760, "step": 11150 }, { "epoch": 5.797817047817047, "grad_norm": 1.2949719429016113, "learning_rate": 4.4707526999367746e-05, "loss": 0.2225, "num_input_tokens_seen": 2127752, "step": 11155 }, { "epoch": 5.8004158004158, "grad_norm": 0.6202715039253235, "learning_rate": 4.4700548081203905e-05, "loss": 0.111, "num_input_tokens_seen": 2128776, "step": 11160 }, { "epoch": 5.803014553014553, "grad_norm": 0.707940936088562, "learning_rate": 4.469356511016786e-05, "loss": 0.1307, "num_input_tokens_seen": 2129704, "step": 11165 }, { "epoch": 5.8056133056133055, "grad_norm": 0.2720872461795807, "learning_rate": 4.468657808769619e-05, "loss": 0.1408, "num_input_tokens_seen": 2130632, "step": 11170 }, { "epoch": 5.808212058212058, "grad_norm": 0.18087129294872284, "learning_rate": 4.4679587015226253e-05, "loss": 0.1165, "num_input_tokens_seen": 2131592, "step": 11175 }, { "epoch": 5.8108108108108105, "grad_norm": 0.7453490495681763, "learning_rate": 4.4672591894196316e-05, "loss": 0.1302, "num_input_tokens_seen": 2132552, "step": 11180 }, { "epoch": 5.8134095634095635, "grad_norm": 0.24707669019699097, "learning_rate": 4.466559272604544e-05, "loss": 0.0554, "num_input_tokens_seen": 2133512, "step": 11185 }, { "epoch": 5.8160083160083165, "grad_norm": 0.9008065462112427, "learning_rate": 4.465858951221349e-05, "loss": 0.145, "num_input_tokens_seen": 2134440, "step": 11190 }, { "epoch": 5.8186070686070686, "grad_norm": 0.6436254978179932, "learning_rate": 4.465158225414122e-05, "loss": 0.206, "num_input_tokens_seen": 2135400, "step": 11195 }, { "epoch": 5.8212058212058215, "grad_norm": 0.808665931224823, "learning_rate": 4.464457095327018e-05, "loss": 0.2536, "num_input_tokens_seen": 2136392, "step": 11200 }, { "epoch": 5.823804573804574, "grad_norm": 0.8808644413948059, "learning_rate": 4.4637555611042776e-05, "loss": 0.1172, "num_input_tokens_seen": 2137320, "step": 11205 }, { "epoch": 5.826403326403327, "grad_norm": 0.42627662420272827, "learning_rate": 4.4630536228902215e-05, "loss": 0.0678, "num_input_tokens_seen": 2138248, "step": 11210 }, { "epoch": 5.829002079002079, "grad_norm": 0.9491948485374451, "learning_rate": 4.462351280829256e-05, "loss": 0.2585, "num_input_tokens_seen": 2139208, "step": 11215 }, { "epoch": 5.831600831600832, "grad_norm": 0.0798887237906456, "learning_rate": 4.4616485350658685e-05, "loss": 0.1677, "num_input_tokens_seen": 2140168, "step": 11220 }, { "epoch": 5.834199584199585, "grad_norm": 0.21969130635261536, "learning_rate": 4.4609453857446315e-05, "loss": 0.1817, "num_input_tokens_seen": 2141160, "step": 11225 }, { "epoch": 5.836798336798337, "grad_norm": 1.1722298860549927, "learning_rate": 4.4602418330102e-05, "loss": 0.103, "num_input_tokens_seen": 2142088, "step": 11230 }, { "epoch": 5.83939708939709, "grad_norm": 0.4007734954357147, "learning_rate": 4.45953787700731e-05, "loss": 0.2145, "num_input_tokens_seen": 2143048, "step": 11235 }, { "epoch": 5.841995841995842, "grad_norm": 0.1528102457523346, "learning_rate": 4.458833517880784e-05, "loss": 0.1981, "num_input_tokens_seen": 2144040, "step": 11240 }, { "epoch": 5.844594594594595, "grad_norm": 0.25730523467063904, "learning_rate": 4.4581287557755235e-05, "loss": 0.1701, "num_input_tokens_seen": 2145064, "step": 11245 }, { "epoch": 5.847193347193347, "grad_norm": 0.16663020849227905, "learning_rate": 4.4574235908365166e-05, "loss": 0.1598, "num_input_tokens_seen": 2145960, "step": 11250 }, { "epoch": 5.8497920997921, "grad_norm": 1.022268533706665, "learning_rate": 4.456718023208831e-05, "loss": 0.2977, "num_input_tokens_seen": 2146920, "step": 11255 }, { "epoch": 5.852390852390853, "grad_norm": 0.23949669301509857, "learning_rate": 4.45601205303762e-05, "loss": 0.1166, "num_input_tokens_seen": 2147880, "step": 11260 }, { "epoch": 5.854989604989605, "grad_norm": 1.303824543952942, "learning_rate": 4.455305680468118e-05, "loss": 0.2066, "num_input_tokens_seen": 2148744, "step": 11265 }, { "epoch": 5.857588357588358, "grad_norm": 0.47572124004364014, "learning_rate": 4.4545989056456426e-05, "loss": 0.1316, "num_input_tokens_seen": 2149672, "step": 11270 }, { "epoch": 5.86018711018711, "grad_norm": 0.782240629196167, "learning_rate": 4.453891728715595e-05, "loss": 0.1098, "num_input_tokens_seen": 2150568, "step": 11275 }, { "epoch": 5.862785862785863, "grad_norm": 0.4950558841228485, "learning_rate": 4.453184149823456e-05, "loss": 0.228, "num_input_tokens_seen": 2151560, "step": 11280 }, { "epoch": 5.865384615384615, "grad_norm": 0.450574666261673, "learning_rate": 4.452476169114794e-05, "loss": 0.1753, "num_input_tokens_seen": 2152456, "step": 11285 }, { "epoch": 5.867983367983368, "grad_norm": 0.7708876729011536, "learning_rate": 4.4517677867352556e-05, "loss": 0.1474, "num_input_tokens_seen": 2153416, "step": 11290 }, { "epoch": 5.870582120582121, "grad_norm": 0.250210165977478, "learning_rate": 4.451059002830574e-05, "loss": 0.1418, "num_input_tokens_seen": 2154376, "step": 11295 }, { "epoch": 5.873180873180873, "grad_norm": 0.41698282957077026, "learning_rate": 4.450349817546562e-05, "loss": 0.2166, "num_input_tokens_seen": 2155304, "step": 11300 }, { "epoch": 5.875779625779626, "grad_norm": 0.4614478051662445, "learning_rate": 4.449640231029115e-05, "loss": 0.0952, "num_input_tokens_seen": 2156296, "step": 11305 }, { "epoch": 5.878378378378378, "grad_norm": 0.49752724170684814, "learning_rate": 4.448930243424213e-05, "loss": 0.1673, "num_input_tokens_seen": 2157256, "step": 11310 }, { "epoch": 5.880977130977131, "grad_norm": 0.11650007963180542, "learning_rate": 4.4482198548779176e-05, "loss": 0.1746, "num_input_tokens_seen": 2158248, "step": 11315 }, { "epoch": 5.883575883575883, "grad_norm": 0.6780220866203308, "learning_rate": 4.447509065536371e-05, "loss": 0.1735, "num_input_tokens_seen": 2159176, "step": 11320 }, { "epoch": 5.886174636174636, "grad_norm": 0.13429422676563263, "learning_rate": 4.446797875545801e-05, "loss": 0.171, "num_input_tokens_seen": 2160136, "step": 11325 }, { "epoch": 5.888773388773389, "grad_norm": 0.6698275804519653, "learning_rate": 4.446086285052515e-05, "loss": 0.259, "num_input_tokens_seen": 2161096, "step": 11330 }, { "epoch": 5.891372141372141, "grad_norm": 0.9714813232421875, "learning_rate": 4.445374294202907e-05, "loss": 0.1421, "num_input_tokens_seen": 2161992, "step": 11335 }, { "epoch": 5.893970893970894, "grad_norm": 0.5731059312820435, "learning_rate": 4.4446619031434464e-05, "loss": 0.2091, "num_input_tokens_seen": 2162984, "step": 11340 }, { "epoch": 5.896569646569646, "grad_norm": 0.8604015707969666, "learning_rate": 4.4439491120206914e-05, "loss": 0.2257, "num_input_tokens_seen": 2163880, "step": 11345 }, { "epoch": 5.899168399168399, "grad_norm": 0.25353071093559265, "learning_rate": 4.4432359209812805e-05, "loss": 0.1335, "num_input_tokens_seen": 2164936, "step": 11350 }, { "epoch": 5.901767151767151, "grad_norm": 0.42744725942611694, "learning_rate": 4.4425223301719335e-05, "loss": 0.0984, "num_input_tokens_seen": 2165864, "step": 11355 }, { "epoch": 5.904365904365904, "grad_norm": 0.23502084612846375, "learning_rate": 4.441808339739453e-05, "loss": 0.1916, "num_input_tokens_seen": 2166824, "step": 11360 }, { "epoch": 5.906964656964657, "grad_norm": 0.5112816095352173, "learning_rate": 4.441093949830723e-05, "loss": 0.1561, "num_input_tokens_seen": 2167752, "step": 11365 }, { "epoch": 5.909563409563409, "grad_norm": 0.6692410111427307, "learning_rate": 4.440379160592711e-05, "loss": 0.1327, "num_input_tokens_seen": 2168680, "step": 11370 }, { "epoch": 5.912162162162162, "grad_norm": 0.12942016124725342, "learning_rate": 4.4396639721724676e-05, "loss": 0.1582, "num_input_tokens_seen": 2169640, "step": 11375 }, { "epoch": 5.914760914760915, "grad_norm": 0.3021777272224426, "learning_rate": 4.438948384717122e-05, "loss": 0.1079, "num_input_tokens_seen": 2170600, "step": 11380 }, { "epoch": 5.917359667359667, "grad_norm": 0.620934009552002, "learning_rate": 4.43823239837389e-05, "loss": 0.1439, "num_input_tokens_seen": 2171560, "step": 11385 }, { "epoch": 5.91995841995842, "grad_norm": 0.2055029571056366, "learning_rate": 4.437516013290065e-05, "loss": 0.1119, "num_input_tokens_seen": 2172520, "step": 11390 }, { "epoch": 5.922557172557172, "grad_norm": 0.12388577312231064, "learning_rate": 4.4367992296130244e-05, "loss": 0.1334, "num_input_tokens_seen": 2173384, "step": 11395 }, { "epoch": 5.925155925155925, "grad_norm": 1.9174836874008179, "learning_rate": 4.4360820474902295e-05, "loss": 0.1595, "num_input_tokens_seen": 2174312, "step": 11400 }, { "epoch": 5.9277546777546775, "grad_norm": 1.4008654356002808, "learning_rate": 4.4353644670692196e-05, "loss": 0.1729, "num_input_tokens_seen": 2175272, "step": 11405 }, { "epoch": 5.93035343035343, "grad_norm": 1.083258032798767, "learning_rate": 4.43464648849762e-05, "loss": 0.1299, "num_input_tokens_seen": 2176232, "step": 11410 }, { "epoch": 5.932952182952183, "grad_norm": 1.5986486673355103, "learning_rate": 4.4339281119231345e-05, "loss": 0.1591, "num_input_tokens_seen": 2177160, "step": 11415 }, { "epoch": 5.9355509355509355, "grad_norm": 0.7811967730522156, "learning_rate": 4.433209337493551e-05, "loss": 0.1297, "num_input_tokens_seen": 2178056, "step": 11420 }, { "epoch": 5.9381496881496885, "grad_norm": 0.05266103520989418, "learning_rate": 4.4324901653567386e-05, "loss": 0.0985, "num_input_tokens_seen": 2179016, "step": 11425 }, { "epoch": 5.9407484407484406, "grad_norm": 0.7838184833526611, "learning_rate": 4.431770595660649e-05, "loss": 0.1172, "num_input_tokens_seen": 2179976, "step": 11430 }, { "epoch": 5.9433471933471935, "grad_norm": 0.3358771800994873, "learning_rate": 4.4310506285533124e-05, "loss": 0.2041, "num_input_tokens_seen": 2180904, "step": 11435 }, { "epoch": 5.945945945945946, "grad_norm": 0.70784592628479, "learning_rate": 4.430330264182845e-05, "loss": 0.1247, "num_input_tokens_seen": 2181864, "step": 11440 }, { "epoch": 5.948544698544699, "grad_norm": 0.7283633947372437, "learning_rate": 4.4296095026974426e-05, "loss": 0.1133, "num_input_tokens_seen": 2182792, "step": 11445 }, { "epoch": 5.951143451143452, "grad_norm": 0.07641790062189102, "learning_rate": 4.4288883442453835e-05, "loss": 0.0767, "num_input_tokens_seen": 2183752, "step": 11450 }, { "epoch": 5.953742203742204, "grad_norm": 0.6776378750801086, "learning_rate": 4.428166788975026e-05, "loss": 0.1969, "num_input_tokens_seen": 2184680, "step": 11455 }, { "epoch": 5.956340956340957, "grad_norm": 0.4822911322116852, "learning_rate": 4.427444837034812e-05, "loss": 0.168, "num_input_tokens_seen": 2185704, "step": 11460 }, { "epoch": 5.958939708939709, "grad_norm": 0.1400527060031891, "learning_rate": 4.426722488573264e-05, "loss": 0.1833, "num_input_tokens_seen": 2186568, "step": 11465 }, { "epoch": 5.961538461538462, "grad_norm": 0.2681649327278137, "learning_rate": 4.425999743738987e-05, "loss": 0.08, "num_input_tokens_seen": 2187464, "step": 11470 }, { "epoch": 5.964137214137214, "grad_norm": 0.9590285420417786, "learning_rate": 4.4252766026806646e-05, "loss": 0.1399, "num_input_tokens_seen": 2188424, "step": 11475 }, { "epoch": 5.966735966735967, "grad_norm": 0.06681091338396072, "learning_rate": 4.424553065547068e-05, "loss": 0.0488, "num_input_tokens_seen": 2189384, "step": 11480 }, { "epoch": 5.96933471933472, "grad_norm": 0.9600732326507568, "learning_rate": 4.4238291324870415e-05, "loss": 0.2385, "num_input_tokens_seen": 2190280, "step": 11485 }, { "epoch": 5.971933471933472, "grad_norm": 0.5006200671195984, "learning_rate": 4.4231048036495196e-05, "loss": 0.2041, "num_input_tokens_seen": 2191240, "step": 11490 }, { "epoch": 5.974532224532225, "grad_norm": 0.23452934622764587, "learning_rate": 4.4223800791835105e-05, "loss": 0.1163, "num_input_tokens_seen": 2192168, "step": 11495 }, { "epoch": 5.977130977130977, "grad_norm": 1.0915545225143433, "learning_rate": 4.42165495923811e-05, "loss": 0.1925, "num_input_tokens_seen": 2193128, "step": 11500 }, { "epoch": 5.97972972972973, "grad_norm": 0.1535671502351761, "learning_rate": 4.4209294439624906e-05, "loss": 0.0916, "num_input_tokens_seen": 2194056, "step": 11505 }, { "epoch": 5.982328482328482, "grad_norm": 0.49315834045410156, "learning_rate": 4.420203533505909e-05, "loss": 0.2287, "num_input_tokens_seen": 2195048, "step": 11510 }, { "epoch": 5.984927234927235, "grad_norm": 0.3380260169506073, "learning_rate": 4.419477228017703e-05, "loss": 0.1519, "num_input_tokens_seen": 2196008, "step": 11515 }, { "epoch": 5.987525987525988, "grad_norm": 0.3676017224788666, "learning_rate": 4.418750527647289e-05, "loss": 0.1552, "num_input_tokens_seen": 2196968, "step": 11520 }, { "epoch": 5.99012474012474, "grad_norm": 0.5551671981811523, "learning_rate": 4.418023432544168e-05, "loss": 0.107, "num_input_tokens_seen": 2197896, "step": 11525 }, { "epoch": 5.992723492723493, "grad_norm": 0.5210845470428467, "learning_rate": 4.417295942857921e-05, "loss": 0.2149, "num_input_tokens_seen": 2198888, "step": 11530 }, { "epoch": 5.995322245322245, "grad_norm": 0.4037236273288727, "learning_rate": 4.4165680587382094e-05, "loss": 0.1447, "num_input_tokens_seen": 2199848, "step": 11535 }, { "epoch": 5.997920997920998, "grad_norm": 0.6966464519500732, "learning_rate": 4.4158397803347764e-05, "loss": 0.1661, "num_input_tokens_seen": 2200776, "step": 11540 }, { "epoch": 6.0, "eval_loss": 0.1500324010848999, "eval_runtime": 8.0218, "eval_samples_per_second": 106.709, "eval_steps_per_second": 26.677, "num_input_tokens_seen": 2201464, "step": 11544 }, { "epoch": 6.000519750519751, "grad_norm": 0.39733386039733887, "learning_rate": 4.415111107797445e-05, "loss": 0.1079, "num_input_tokens_seen": 2201624, "step": 11545 }, { "epoch": 6.003118503118503, "grad_norm": 0.5592541098594666, "learning_rate": 4.4143820412761226e-05, "loss": 0.0647, "num_input_tokens_seen": 2202584, "step": 11550 }, { "epoch": 6.005717255717256, "grad_norm": 0.5649378895759583, "learning_rate": 4.4136525809207954e-05, "loss": 0.1591, "num_input_tokens_seen": 2203480, "step": 11555 }, { "epoch": 6.008316008316008, "grad_norm": 0.7586964964866638, "learning_rate": 4.412922726881529e-05, "loss": 0.2352, "num_input_tokens_seen": 2204408, "step": 11560 }, { "epoch": 6.010914760914761, "grad_norm": 0.9958995580673218, "learning_rate": 4.412192479308473e-05, "loss": 0.2375, "num_input_tokens_seen": 2205368, "step": 11565 }, { "epoch": 6.013513513513513, "grad_norm": 0.7054421305656433, "learning_rate": 4.4114618383518563e-05, "loss": 0.1294, "num_input_tokens_seen": 2206360, "step": 11570 }, { "epoch": 6.016112266112266, "grad_norm": 0.5786017179489136, "learning_rate": 4.41073080416199e-05, "loss": 0.1564, "num_input_tokens_seen": 2207320, "step": 11575 }, { "epoch": 6.018711018711019, "grad_norm": 0.6226568818092346, "learning_rate": 4.4099993768892635e-05, "loss": 0.0695, "num_input_tokens_seen": 2208280, "step": 11580 }, { "epoch": 6.021309771309771, "grad_norm": 0.6626380085945129, "learning_rate": 4.409267556684151e-05, "loss": 0.1026, "num_input_tokens_seen": 2209208, "step": 11585 }, { "epoch": 6.023908523908524, "grad_norm": 0.32684051990509033, "learning_rate": 4.408535343697204e-05, "loss": 0.1109, "num_input_tokens_seen": 2210072, "step": 11590 }, { "epoch": 6.026507276507276, "grad_norm": 0.9200376272201538, "learning_rate": 4.407802738079057e-05, "loss": 0.1476, "num_input_tokens_seen": 2211032, "step": 11595 }, { "epoch": 6.029106029106029, "grad_norm": 0.26877230405807495, "learning_rate": 4.407069739980423e-05, "loss": 0.1573, "num_input_tokens_seen": 2212024, "step": 11600 }, { "epoch": 6.031704781704781, "grad_norm": 0.29027611017227173, "learning_rate": 4.4063363495520984e-05, "loss": 0.1449, "num_input_tokens_seen": 2212984, "step": 11605 }, { "epoch": 6.034303534303534, "grad_norm": 0.22813796997070312, "learning_rate": 4.4056025669449584e-05, "loss": 0.1123, "num_input_tokens_seen": 2213912, "step": 11610 }, { "epoch": 6.036902286902287, "grad_norm": 0.7358130216598511, "learning_rate": 4.404868392309961e-05, "loss": 0.1604, "num_input_tokens_seen": 2214872, "step": 11615 }, { "epoch": 6.039501039501039, "grad_norm": 0.588776171207428, "learning_rate": 4.404133825798141e-05, "loss": 0.2266, "num_input_tokens_seen": 2215800, "step": 11620 }, { "epoch": 6.042099792099792, "grad_norm": 0.1948794275522232, "learning_rate": 4.4033988675606184e-05, "loss": 0.1656, "num_input_tokens_seen": 2216760, "step": 11625 }, { "epoch": 6.044698544698544, "grad_norm": 1.0432053804397583, "learning_rate": 4.40266351774859e-05, "loss": 0.1806, "num_input_tokens_seen": 2217752, "step": 11630 }, { "epoch": 6.047297297297297, "grad_norm": 0.7831348776817322, "learning_rate": 4.401927776513336e-05, "loss": 0.2312, "num_input_tokens_seen": 2218680, "step": 11635 }, { "epoch": 6.04989604989605, "grad_norm": 0.951896071434021, "learning_rate": 4.401191644006216e-05, "loss": 0.1435, "num_input_tokens_seen": 2219704, "step": 11640 }, { "epoch": 6.052494802494802, "grad_norm": 1.1713659763336182, "learning_rate": 4.400455120378669e-05, "loss": 0.2015, "num_input_tokens_seen": 2220536, "step": 11645 }, { "epoch": 6.055093555093555, "grad_norm": 0.44953054189682007, "learning_rate": 4.3997182057822166e-05, "loss": 0.117, "num_input_tokens_seen": 2221496, "step": 11650 }, { "epoch": 6.0576923076923075, "grad_norm": 1.3184453248977661, "learning_rate": 4.398980900368459e-05, "loss": 0.1958, "num_input_tokens_seen": 2222488, "step": 11655 }, { "epoch": 6.0602910602910605, "grad_norm": 1.914171814918518, "learning_rate": 4.398243204289077e-05, "loss": 0.1758, "num_input_tokens_seen": 2223384, "step": 11660 }, { "epoch": 6.0628898128898125, "grad_norm": 0.03815937042236328, "learning_rate": 4.3975051176958326e-05, "loss": 0.1302, "num_input_tokens_seen": 2224344, "step": 11665 }, { "epoch": 6.0654885654885655, "grad_norm": 0.5089296698570251, "learning_rate": 4.3967666407405686e-05, "loss": 0.1108, "num_input_tokens_seen": 2225304, "step": 11670 }, { "epoch": 6.0680873180873185, "grad_norm": 0.708353579044342, "learning_rate": 4.3960277735752066e-05, "loss": 0.1377, "num_input_tokens_seen": 2226232, "step": 11675 }, { "epoch": 6.070686070686071, "grad_norm": 0.4165703058242798, "learning_rate": 4.395288516351749e-05, "loss": 0.1573, "num_input_tokens_seen": 2227128, "step": 11680 }, { "epoch": 6.0732848232848236, "grad_norm": 0.2500089108943939, "learning_rate": 4.3945488692222784e-05, "loss": 0.1602, "num_input_tokens_seen": 2228120, "step": 11685 }, { "epoch": 6.075883575883576, "grad_norm": 0.6125885248184204, "learning_rate": 4.393808832338959e-05, "loss": 0.0878, "num_input_tokens_seen": 2229016, "step": 11690 }, { "epoch": 6.078482328482329, "grad_norm": 1.1936732530593872, "learning_rate": 4.3930684058540336e-05, "loss": 0.1564, "num_input_tokens_seen": 2230008, "step": 11695 }, { "epoch": 6.081081081081081, "grad_norm": 0.29844507575035095, "learning_rate": 4.392327589919825e-05, "loss": 0.115, "num_input_tokens_seen": 2230936, "step": 11700 }, { "epoch": 6.083679833679834, "grad_norm": 0.47495219111442566, "learning_rate": 4.3915863846887376e-05, "loss": 0.2198, "num_input_tokens_seen": 2231960, "step": 11705 }, { "epoch": 6.086278586278587, "grad_norm": 1.1147342920303345, "learning_rate": 4.390844790313254e-05, "loss": 0.2243, "num_input_tokens_seen": 2232984, "step": 11710 }, { "epoch": 6.088877338877339, "grad_norm": 0.5477253198623657, "learning_rate": 4.390102806945938e-05, "loss": 0.1466, "num_input_tokens_seen": 2233944, "step": 11715 }, { "epoch": 6.091476091476092, "grad_norm": 0.4685265123844147, "learning_rate": 4.3893604347394334e-05, "loss": 0.1138, "num_input_tokens_seen": 2234904, "step": 11720 }, { "epoch": 6.094074844074844, "grad_norm": 0.0855516716837883, "learning_rate": 4.388617673846465e-05, "loss": 0.0866, "num_input_tokens_seen": 2235832, "step": 11725 }, { "epoch": 6.096673596673597, "grad_norm": 1.204153299331665, "learning_rate": 4.3878745244198345e-05, "loss": 0.0772, "num_input_tokens_seen": 2236824, "step": 11730 }, { "epoch": 6.099272349272349, "grad_norm": 0.6468479633331299, "learning_rate": 4.387130986612427e-05, "loss": 0.1037, "num_input_tokens_seen": 2237752, "step": 11735 }, { "epoch": 6.101871101871102, "grad_norm": 1.4753514528274536, "learning_rate": 4.386387060577204e-05, "loss": 0.1254, "num_input_tokens_seen": 2238744, "step": 11740 }, { "epoch": 6.104469854469855, "grad_norm": 1.0777403116226196, "learning_rate": 4.3856427464672113e-05, "loss": 0.193, "num_input_tokens_seen": 2239704, "step": 11745 }, { "epoch": 6.107068607068607, "grad_norm": 0.597189724445343, "learning_rate": 4.3848980444355705e-05, "loss": 0.1249, "num_input_tokens_seen": 2240632, "step": 11750 }, { "epoch": 6.10966735966736, "grad_norm": 1.0839377641677856, "learning_rate": 4.384152954635485e-05, "loss": 0.0964, "num_input_tokens_seen": 2241560, "step": 11755 }, { "epoch": 6.112266112266112, "grad_norm": 0.9667999148368835, "learning_rate": 4.3834074772202374e-05, "loss": 0.1656, "num_input_tokens_seen": 2242520, "step": 11760 }, { "epoch": 6.114864864864865, "grad_norm": 1.3168002367019653, "learning_rate": 4.38266161234319e-05, "loss": 0.1855, "num_input_tokens_seen": 2243512, "step": 11765 }, { "epoch": 6.117463617463618, "grad_norm": 0.5118000507354736, "learning_rate": 4.381915360157784e-05, "loss": 0.177, "num_input_tokens_seen": 2244504, "step": 11770 }, { "epoch": 6.12006237006237, "grad_norm": 0.5037721991539001, "learning_rate": 4.381168720817545e-05, "loss": 0.0701, "num_input_tokens_seen": 2245528, "step": 11775 }, { "epoch": 6.122661122661123, "grad_norm": 1.0058308839797974, "learning_rate": 4.38042169447607e-05, "loss": 0.1718, "num_input_tokens_seen": 2246520, "step": 11780 }, { "epoch": 6.125259875259875, "grad_norm": 0.6343212723731995, "learning_rate": 4.379674281287043e-05, "loss": 0.164, "num_input_tokens_seen": 2247480, "step": 11785 }, { "epoch": 6.127858627858628, "grad_norm": 2.2474799156188965, "learning_rate": 4.378926481404223e-05, "loss": 0.12, "num_input_tokens_seen": 2248440, "step": 11790 }, { "epoch": 6.13045738045738, "grad_norm": 0.118552066385746, "learning_rate": 4.378178294981451e-05, "loss": 0.0884, "num_input_tokens_seen": 2249368, "step": 11795 }, { "epoch": 6.133056133056133, "grad_norm": 0.19821996986865997, "learning_rate": 4.377429722172648e-05, "loss": 0.1356, "num_input_tokens_seen": 2250328, "step": 11800 }, { "epoch": 6.135654885654886, "grad_norm": 0.9583010077476501, "learning_rate": 4.3766807631318106e-05, "loss": 0.0976, "num_input_tokens_seen": 2251320, "step": 11805 }, { "epoch": 6.138253638253638, "grad_norm": 0.5013343095779419, "learning_rate": 4.37593141801302e-05, "loss": 0.1206, "num_input_tokens_seen": 2252312, "step": 11810 }, { "epoch": 6.140852390852391, "grad_norm": 0.5889897346496582, "learning_rate": 4.3751816869704334e-05, "loss": 0.1434, "num_input_tokens_seen": 2253304, "step": 11815 }, { "epoch": 6.143451143451143, "grad_norm": 0.5130176544189453, "learning_rate": 4.374431570158287e-05, "loss": 0.0562, "num_input_tokens_seen": 2254296, "step": 11820 }, { "epoch": 6.146049896049896, "grad_norm": 0.45903530716896057, "learning_rate": 4.3736810677309003e-05, "loss": 0.1839, "num_input_tokens_seen": 2255288, "step": 11825 }, { "epoch": 6.148648648648648, "grad_norm": 1.4489948749542236, "learning_rate": 4.372930179842668e-05, "loss": 0.1341, "num_input_tokens_seen": 2256280, "step": 11830 }, { "epoch": 6.151247401247401, "grad_norm": 0.7386794686317444, "learning_rate": 4.372178906648066e-05, "loss": 0.1917, "num_input_tokens_seen": 2257176, "step": 11835 }, { "epoch": 6.153846153846154, "grad_norm": 0.6017658710479736, "learning_rate": 4.3714272483016485e-05, "loss": 0.0691, "num_input_tokens_seen": 2258072, "step": 11840 }, { "epoch": 6.156444906444906, "grad_norm": 0.40169423818588257, "learning_rate": 4.370675204958049e-05, "loss": 0.1742, "num_input_tokens_seen": 2259000, "step": 11845 }, { "epoch": 6.159043659043659, "grad_norm": 0.5661694407463074, "learning_rate": 4.369922776771983e-05, "loss": 0.0792, "num_input_tokens_seen": 2259896, "step": 11850 }, { "epoch": 6.161642411642411, "grad_norm": 0.8248697519302368, "learning_rate": 4.3691699638982415e-05, "loss": 0.065, "num_input_tokens_seen": 2260728, "step": 11855 }, { "epoch": 6.164241164241164, "grad_norm": 0.3870883584022522, "learning_rate": 4.368416766491696e-05, "loss": 0.1921, "num_input_tokens_seen": 2261656, "step": 11860 }, { "epoch": 6.166839916839917, "grad_norm": 0.44788658618927, "learning_rate": 4.367663184707297e-05, "loss": 0.1033, "num_input_tokens_seen": 2262584, "step": 11865 }, { "epoch": 6.169438669438669, "grad_norm": 0.48865145444869995, "learning_rate": 4.366909218700075e-05, "loss": 0.1323, "num_input_tokens_seen": 2263480, "step": 11870 }, { "epoch": 6.172037422037422, "grad_norm": 0.4492849111557007, "learning_rate": 4.366154868625138e-05, "loss": 0.0992, "num_input_tokens_seen": 2264472, "step": 11875 }, { "epoch": 6.174636174636174, "grad_norm": 1.2089885473251343, "learning_rate": 4.365400134637674e-05, "loss": 0.2789, "num_input_tokens_seen": 2265432, "step": 11880 }, { "epoch": 6.177234927234927, "grad_norm": 0.8395290374755859, "learning_rate": 4.36464501689295e-05, "loss": 0.1584, "num_input_tokens_seen": 2266392, "step": 11885 }, { "epoch": 6.1798336798336795, "grad_norm": 0.9425389766693115, "learning_rate": 4.363889515546312e-05, "loss": 0.1407, "num_input_tokens_seen": 2267288, "step": 11890 }, { "epoch": 6.1824324324324325, "grad_norm": 0.7931290864944458, "learning_rate": 4.363133630753184e-05, "loss": 0.07, "num_input_tokens_seen": 2268216, "step": 11895 }, { "epoch": 6.185031185031185, "grad_norm": 0.43280476331710815, "learning_rate": 4.3623773626690696e-05, "loss": 0.1034, "num_input_tokens_seen": 2269112, "step": 11900 }, { "epoch": 6.1876299376299375, "grad_norm": 0.43622806668281555, "learning_rate": 4.3616207114495514e-05, "loss": 0.0723, "num_input_tokens_seen": 2270104, "step": 11905 }, { "epoch": 6.1902286902286905, "grad_norm": 1.0375730991363525, "learning_rate": 4.360863677250291e-05, "loss": 0.1382, "num_input_tokens_seen": 2271096, "step": 11910 }, { "epoch": 6.192827442827443, "grad_norm": 0.26112470030784607, "learning_rate": 4.360106260227027e-05, "loss": 0.0864, "num_input_tokens_seen": 2271992, "step": 11915 }, { "epoch": 6.1954261954261955, "grad_norm": 0.8619083166122437, "learning_rate": 4.35934846053558e-05, "loss": 0.1361, "num_input_tokens_seen": 2272920, "step": 11920 }, { "epoch": 6.198024948024948, "grad_norm": 0.4516182541847229, "learning_rate": 4.358590278331844e-05, "loss": 0.08, "num_input_tokens_seen": 2273880, "step": 11925 }, { "epoch": 6.200623700623701, "grad_norm": 0.3919229209423065, "learning_rate": 4.3578317137718e-05, "loss": 0.2201, "num_input_tokens_seen": 2274840, "step": 11930 }, { "epoch": 6.203222453222454, "grad_norm": 0.7575113773345947, "learning_rate": 4.3570727670115e-05, "loss": 0.2445, "num_input_tokens_seen": 2275800, "step": 11935 }, { "epoch": 6.205821205821206, "grad_norm": 1.089624047279358, "learning_rate": 4.3563134382070775e-05, "loss": 0.1213, "num_input_tokens_seen": 2276760, "step": 11940 }, { "epoch": 6.208419958419959, "grad_norm": 1.3382903337478638, "learning_rate": 4.355553727514745e-05, "loss": 0.2462, "num_input_tokens_seen": 2277688, "step": 11945 }, { "epoch": 6.211018711018711, "grad_norm": 0.16172949969768524, "learning_rate": 4.3547936350907924e-05, "loss": 0.1288, "num_input_tokens_seen": 2278616, "step": 11950 }, { "epoch": 6.213617463617464, "grad_norm": 1.1006978750228882, "learning_rate": 4.3540331610915905e-05, "loss": 0.3835, "num_input_tokens_seen": 2279576, "step": 11955 }, { "epoch": 6.216216216216216, "grad_norm": 0.509483277797699, "learning_rate": 4.353272305673585e-05, "loss": 0.1097, "num_input_tokens_seen": 2280536, "step": 11960 }, { "epoch": 6.218814968814969, "grad_norm": 0.2203962653875351, "learning_rate": 4.352511068993304e-05, "loss": 0.1228, "num_input_tokens_seen": 2281496, "step": 11965 }, { "epoch": 6.221413721413722, "grad_norm": 0.12086667120456696, "learning_rate": 4.3517494512073485e-05, "loss": 0.1027, "num_input_tokens_seen": 2282456, "step": 11970 }, { "epoch": 6.224012474012474, "grad_norm": 0.2909742593765259, "learning_rate": 4.350987452472406e-05, "loss": 0.1887, "num_input_tokens_seen": 2283416, "step": 11975 }, { "epoch": 6.226611226611227, "grad_norm": 0.41210344433784485, "learning_rate": 4.350225072945234e-05, "loss": 0.2729, "num_input_tokens_seen": 2284408, "step": 11980 }, { "epoch": 6.229209979209979, "grad_norm": 0.45957377552986145, "learning_rate": 4.349462312782674e-05, "loss": 0.1239, "num_input_tokens_seen": 2285272, "step": 11985 }, { "epoch": 6.231808731808732, "grad_norm": 0.4189331829547882, "learning_rate": 4.348699172141645e-05, "loss": 0.0983, "num_input_tokens_seen": 2286168, "step": 11990 }, { "epoch": 6.234407484407485, "grad_norm": 0.8668312430381775, "learning_rate": 4.3479356511791404e-05, "loss": 0.0678, "num_input_tokens_seen": 2287032, "step": 11995 }, { "epoch": 6.237006237006237, "grad_norm": 0.39444634318351746, "learning_rate": 4.3471717500522366e-05, "loss": 0.1768, "num_input_tokens_seen": 2287992, "step": 12000 }, { "epoch": 6.23960498960499, "grad_norm": 0.3341515362262726, "learning_rate": 4.3464074689180855e-05, "loss": 0.1278, "num_input_tokens_seen": 2288920, "step": 12005 }, { "epoch": 6.242203742203742, "grad_norm": 0.43098336458206177, "learning_rate": 4.345642807933919e-05, "loss": 0.0623, "num_input_tokens_seen": 2289880, "step": 12010 }, { "epoch": 6.244802494802495, "grad_norm": 0.5364782214164734, "learning_rate": 4.344877767257044e-05, "loss": 0.1502, "num_input_tokens_seen": 2290872, "step": 12015 }, { "epoch": 6.247401247401247, "grad_norm": 0.4641773998737335, "learning_rate": 4.34411234704485e-05, "loss": 0.1732, "num_input_tokens_seen": 2291896, "step": 12020 }, { "epoch": 6.25, "grad_norm": 0.6441538333892822, "learning_rate": 4.343346547454801e-05, "loss": 0.2053, "num_input_tokens_seen": 2292888, "step": 12025 }, { "epoch": 6.252598752598753, "grad_norm": 0.17544740438461304, "learning_rate": 4.342580368644439e-05, "loss": 0.1051, "num_input_tokens_seen": 2293784, "step": 12030 }, { "epoch": 6.255197505197505, "grad_norm": 0.765024721622467, "learning_rate": 4.341813810771388e-05, "loss": 0.0769, "num_input_tokens_seen": 2294712, "step": 12035 }, { "epoch": 6.257796257796258, "grad_norm": 0.7698157429695129, "learning_rate": 4.341046873993345e-05, "loss": 0.2165, "num_input_tokens_seen": 2295608, "step": 12040 }, { "epoch": 6.26039501039501, "grad_norm": 0.46202975511550903, "learning_rate": 4.340279558468088e-05, "loss": 0.2131, "num_input_tokens_seen": 2296568, "step": 12045 }, { "epoch": 6.262993762993763, "grad_norm": 0.7702754735946655, "learning_rate": 4.339511864353473e-05, "loss": 0.1331, "num_input_tokens_seen": 2297432, "step": 12050 }, { "epoch": 6.265592515592515, "grad_norm": 0.7013161182403564, "learning_rate": 4.3387437918074304e-05, "loss": 0.1363, "num_input_tokens_seen": 2298328, "step": 12055 }, { "epoch": 6.268191268191268, "grad_norm": 0.6157888174057007, "learning_rate": 4.337975340987973e-05, "loss": 0.0854, "num_input_tokens_seen": 2299224, "step": 12060 }, { "epoch": 6.270790020790021, "grad_norm": 0.6250498294830322, "learning_rate": 4.33720651205319e-05, "loss": 0.1934, "num_input_tokens_seen": 2300152, "step": 12065 }, { "epoch": 6.273388773388773, "grad_norm": 0.24402420222759247, "learning_rate": 4.336437305161246e-05, "loss": 0.1006, "num_input_tokens_seen": 2301048, "step": 12070 }, { "epoch": 6.275987525987526, "grad_norm": 0.28933775424957275, "learning_rate": 4.335667720470386e-05, "loss": 0.101, "num_input_tokens_seen": 2301944, "step": 12075 }, { "epoch": 6.278586278586278, "grad_norm": 0.25037914514541626, "learning_rate": 4.3348977581389325e-05, "loss": 0.0783, "num_input_tokens_seen": 2302936, "step": 12080 }, { "epoch": 6.281185031185031, "grad_norm": 0.26870596408843994, "learning_rate": 4.3341274183252854e-05, "loss": 0.0979, "num_input_tokens_seen": 2303832, "step": 12085 }, { "epoch": 6.283783783783784, "grad_norm": 0.6043311357498169, "learning_rate": 4.33335670118792e-05, "loss": 0.1765, "num_input_tokens_seen": 2304824, "step": 12090 }, { "epoch": 6.286382536382536, "grad_norm": 0.6783658862113953, "learning_rate": 4.3325856068853924e-05, "loss": 0.0698, "num_input_tokens_seen": 2305816, "step": 12095 }, { "epoch": 6.288981288981289, "grad_norm": 0.7033111453056335, "learning_rate": 4.331814135576335e-05, "loss": 0.1805, "num_input_tokens_seen": 2306840, "step": 12100 }, { "epoch": 6.291580041580041, "grad_norm": 1.0867570638656616, "learning_rate": 4.331042287419458e-05, "loss": 0.2005, "num_input_tokens_seen": 2307768, "step": 12105 }, { "epoch": 6.294178794178794, "grad_norm": 0.6369320154190063, "learning_rate": 4.3302700625735494e-05, "loss": 0.1412, "num_input_tokens_seen": 2308760, "step": 12110 }, { "epoch": 6.296777546777546, "grad_norm": 0.7026015520095825, "learning_rate": 4.3294974611974725e-05, "loss": 0.2088, "num_input_tokens_seen": 2309656, "step": 12115 }, { "epoch": 6.299376299376299, "grad_norm": 0.3336123824119568, "learning_rate": 4.328724483450172e-05, "loss": 0.0415, "num_input_tokens_seen": 2310584, "step": 12120 }, { "epoch": 6.301975051975052, "grad_norm": 0.47940492630004883, "learning_rate": 4.327951129490666e-05, "loss": 0.0912, "num_input_tokens_seen": 2311480, "step": 12125 }, { "epoch": 6.3045738045738045, "grad_norm": 0.6197492480278015, "learning_rate": 4.327177399478053e-05, "loss": 0.1411, "num_input_tokens_seen": 2312440, "step": 12130 }, { "epoch": 6.307172557172557, "grad_norm": 1.3350598812103271, "learning_rate": 4.326403293571508e-05, "loss": 0.1608, "num_input_tokens_seen": 2313432, "step": 12135 }, { "epoch": 6.3097713097713095, "grad_norm": 0.6668146848678589, "learning_rate": 4.325628811930281e-05, "loss": 0.1166, "num_input_tokens_seen": 2314456, "step": 12140 }, { "epoch": 6.3123700623700625, "grad_norm": 0.9037585854530334, "learning_rate": 4.3248539547137034e-05, "loss": 0.0887, "num_input_tokens_seen": 2315416, "step": 12145 }, { "epoch": 6.314968814968815, "grad_norm": 0.7849273085594177, "learning_rate": 4.324078722081182e-05, "loss": 0.1965, "num_input_tokens_seen": 2316440, "step": 12150 }, { "epoch": 6.3175675675675675, "grad_norm": 0.5215352773666382, "learning_rate": 4.323303114192198e-05, "loss": 0.2035, "num_input_tokens_seen": 2317464, "step": 12155 }, { "epoch": 6.3201663201663205, "grad_norm": 0.5606182813644409, "learning_rate": 4.3225271312063155e-05, "loss": 0.1513, "num_input_tokens_seen": 2318360, "step": 12160 }, { "epoch": 6.322765072765073, "grad_norm": 0.5554373264312744, "learning_rate": 4.3217507732831706e-05, "loss": 0.104, "num_input_tokens_seen": 2319320, "step": 12165 }, { "epoch": 6.325363825363826, "grad_norm": 0.7372795939445496, "learning_rate": 4.320974040582479e-05, "loss": 0.1205, "num_input_tokens_seen": 2320248, "step": 12170 }, { "epoch": 6.327962577962578, "grad_norm": 0.9534460306167603, "learning_rate": 4.320196933264035e-05, "loss": 0.191, "num_input_tokens_seen": 2321208, "step": 12175 }, { "epoch": 6.330561330561331, "grad_norm": 0.2908423840999603, "learning_rate": 4.319419451487705e-05, "loss": 0.1097, "num_input_tokens_seen": 2322136, "step": 12180 }, { "epoch": 6.333160083160083, "grad_norm": 0.218324676156044, "learning_rate": 4.3186415954134386e-05, "loss": 0.0993, "num_input_tokens_seen": 2323160, "step": 12185 }, { "epoch": 6.335758835758836, "grad_norm": 0.16658373177051544, "learning_rate": 4.3178633652012567e-05, "loss": 0.087, "num_input_tokens_seen": 2324120, "step": 12190 }, { "epoch": 6.338357588357589, "grad_norm": 1.4388097524642944, "learning_rate": 4.3170847610112606e-05, "loss": 0.1343, "num_input_tokens_seen": 2325048, "step": 12195 }, { "epoch": 6.340956340956341, "grad_norm": 0.9674620032310486, "learning_rate": 4.316305783003629e-05, "loss": 0.1428, "num_input_tokens_seen": 2326040, "step": 12200 }, { "epoch": 6.343555093555094, "grad_norm": 0.6570072770118713, "learning_rate": 4.3155264313386156e-05, "loss": 0.1047, "num_input_tokens_seen": 2327000, "step": 12205 }, { "epoch": 6.346153846153846, "grad_norm": 1.1113362312316895, "learning_rate": 4.31474670617655e-05, "loss": 0.0954, "num_input_tokens_seen": 2327992, "step": 12210 }, { "epoch": 6.348752598752599, "grad_norm": 1.0615805387496948, "learning_rate": 4.313966607677843e-05, "loss": 0.1837, "num_input_tokens_seen": 2328984, "step": 12215 }, { "epoch": 6.351351351351352, "grad_norm": 1.0761265754699707, "learning_rate": 4.3131861360029775e-05, "loss": 0.2399, "num_input_tokens_seen": 2329976, "step": 12220 }, { "epoch": 6.353950103950104, "grad_norm": 1.142102599143982, "learning_rate": 4.312405291312516e-05, "loss": 0.1435, "num_input_tokens_seen": 2330936, "step": 12225 }, { "epoch": 6.356548856548857, "grad_norm": 0.33351948857307434, "learning_rate": 4.311624073767095e-05, "loss": 0.1717, "num_input_tokens_seen": 2331992, "step": 12230 }, { "epoch": 6.359147609147609, "grad_norm": 0.4989392161369324, "learning_rate": 4.310842483527433e-05, "loss": 0.1959, "num_input_tokens_seen": 2332920, "step": 12235 }, { "epoch": 6.361746361746362, "grad_norm": 1.1787134408950806, "learning_rate": 4.310060520754319e-05, "loss": 0.1631, "num_input_tokens_seen": 2333880, "step": 12240 }, { "epoch": 6.364345114345114, "grad_norm": 0.8153384327888489, "learning_rate": 4.3092781856086215e-05, "loss": 0.1847, "num_input_tokens_seen": 2334840, "step": 12245 }, { "epoch": 6.366943866943867, "grad_norm": 0.8638612627983093, "learning_rate": 4.3084954782512875e-05, "loss": 0.098, "num_input_tokens_seen": 2335768, "step": 12250 }, { "epoch": 6.36954261954262, "grad_norm": 0.2744535505771637, "learning_rate": 4.3077123988433366e-05, "loss": 0.1708, "num_input_tokens_seen": 2336728, "step": 12255 }, { "epoch": 6.372141372141372, "grad_norm": 0.6209651231765747, "learning_rate": 4.3069289475458676e-05, "loss": 0.2233, "num_input_tokens_seen": 2337624, "step": 12260 }, { "epoch": 6.374740124740125, "grad_norm": 1.1545287370681763, "learning_rate": 4.306145124520055e-05, "loss": 0.153, "num_input_tokens_seen": 2338584, "step": 12265 }, { "epoch": 6.377338877338877, "grad_norm": 1.11345636844635, "learning_rate": 4.3053609299271494e-05, "loss": 0.3081, "num_input_tokens_seen": 2339512, "step": 12270 }, { "epoch": 6.37993762993763, "grad_norm": 1.051179051399231, "learning_rate": 4.3045763639284794e-05, "loss": 0.1467, "num_input_tokens_seen": 2340472, "step": 12275 }, { "epoch": 6.382536382536383, "grad_norm": 0.7860257029533386, "learning_rate": 4.3037914266854486e-05, "loss": 0.0875, "num_input_tokens_seen": 2341432, "step": 12280 }, { "epoch": 6.385135135135135, "grad_norm": 0.20920413732528687, "learning_rate": 4.303006118359537e-05, "loss": 0.0572, "num_input_tokens_seen": 2342392, "step": 12285 }, { "epoch": 6.387733887733888, "grad_norm": 1.0024290084838867, "learning_rate": 4.302220439112301e-05, "loss": 0.2242, "num_input_tokens_seen": 2343352, "step": 12290 }, { "epoch": 6.39033264033264, "grad_norm": 1.174593210220337, "learning_rate": 4.3014343891053734e-05, "loss": 0.1535, "num_input_tokens_seen": 2344248, "step": 12295 }, { "epoch": 6.392931392931393, "grad_norm": 0.7900076508522034, "learning_rate": 4.300647968500464e-05, "loss": 0.0661, "num_input_tokens_seen": 2345176, "step": 12300 }, { "epoch": 6.395530145530145, "grad_norm": 0.1889425665140152, "learning_rate": 4.299861177459358e-05, "loss": 0.1097, "num_input_tokens_seen": 2346136, "step": 12305 }, { "epoch": 6.398128898128898, "grad_norm": 0.1765357404947281, "learning_rate": 4.299074016143917e-05, "loss": 0.1367, "num_input_tokens_seen": 2347064, "step": 12310 }, { "epoch": 6.400727650727651, "grad_norm": 0.5551681518554688, "learning_rate": 4.29828648471608e-05, "loss": 0.1111, "num_input_tokens_seen": 2347960, "step": 12315 }, { "epoch": 6.403326403326403, "grad_norm": 1.1158756017684937, "learning_rate": 4.297498583337859e-05, "loss": 0.1436, "num_input_tokens_seen": 2348920, "step": 12320 }, { "epoch": 6.405925155925156, "grad_norm": 0.15749040246009827, "learning_rate": 4.2967103121713456e-05, "loss": 0.0607, "num_input_tokens_seen": 2349880, "step": 12325 }, { "epoch": 6.408523908523908, "grad_norm": 0.48481786251068115, "learning_rate": 4.295921671378706e-05, "loss": 0.1505, "num_input_tokens_seen": 2350904, "step": 12330 }, { "epoch": 6.411122661122661, "grad_norm": 0.27143439650535583, "learning_rate": 4.295132661122181e-05, "loss": 0.1434, "num_input_tokens_seen": 2351864, "step": 12335 }, { "epoch": 6.413721413721413, "grad_norm": 1.189965844154358, "learning_rate": 4.29434328156409e-05, "loss": 0.1215, "num_input_tokens_seen": 2352824, "step": 12340 }, { "epoch": 6.416320166320166, "grad_norm": 0.9162665009498596, "learning_rate": 4.2935535328668263e-05, "loss": 0.2126, "num_input_tokens_seen": 2353816, "step": 12345 }, { "epoch": 6.418918918918919, "grad_norm": 0.15125153958797455, "learning_rate": 4.2927634151928605e-05, "loss": 0.1711, "num_input_tokens_seen": 2354712, "step": 12350 }, { "epoch": 6.421517671517671, "grad_norm": 0.31471365690231323, "learning_rate": 4.291972928704739e-05, "loss": 0.1021, "num_input_tokens_seen": 2355672, "step": 12355 }, { "epoch": 6.424116424116424, "grad_norm": 0.5490149259567261, "learning_rate": 4.2911820735650845e-05, "loss": 0.1913, "num_input_tokens_seen": 2356632, "step": 12360 }, { "epoch": 6.4267151767151764, "grad_norm": 0.449692964553833, "learning_rate": 4.290390849936592e-05, "loss": 0.2026, "num_input_tokens_seen": 2357592, "step": 12365 }, { "epoch": 6.429313929313929, "grad_norm": 0.9188866019248962, "learning_rate": 4.2895992579820373e-05, "loss": 0.1075, "num_input_tokens_seen": 2358552, "step": 12370 }, { "epoch": 6.4319126819126815, "grad_norm": 0.5411826968193054, "learning_rate": 4.288807297864269e-05, "loss": 0.1149, "num_input_tokens_seen": 2359480, "step": 12375 }, { "epoch": 6.4345114345114345, "grad_norm": 0.997243344783783, "learning_rate": 4.288014969746213e-05, "loss": 0.155, "num_input_tokens_seen": 2360472, "step": 12380 }, { "epoch": 6.4371101871101875, "grad_norm": 0.40693777799606323, "learning_rate": 4.287222273790868e-05, "loss": 0.0326, "num_input_tokens_seen": 2361368, "step": 12385 }, { "epoch": 6.4397089397089395, "grad_norm": 0.04598024860024452, "learning_rate": 4.2864292101613136e-05, "loss": 0.0888, "num_input_tokens_seen": 2362328, "step": 12390 }, { "epoch": 6.4423076923076925, "grad_norm": 0.37303873896598816, "learning_rate": 4.285635779020699e-05, "loss": 0.1063, "num_input_tokens_seen": 2363256, "step": 12395 }, { "epoch": 6.444906444906445, "grad_norm": 0.31595927476882935, "learning_rate": 4.284841980532254e-05, "loss": 0.1119, "num_input_tokens_seen": 2364248, "step": 12400 }, { "epoch": 6.447505197505198, "grad_norm": 1.35063636302948, "learning_rate": 4.284047814859279e-05, "loss": 0.2213, "num_input_tokens_seen": 2365208, "step": 12405 }, { "epoch": 6.45010395010395, "grad_norm": 0.5701920986175537, "learning_rate": 4.283253282165156e-05, "loss": 0.1754, "num_input_tokens_seen": 2366168, "step": 12410 }, { "epoch": 6.452702702702703, "grad_norm": 0.09338025748729706, "learning_rate": 4.282458382613338e-05, "loss": 0.1317, "num_input_tokens_seen": 2367160, "step": 12415 }, { "epoch": 6.455301455301456, "grad_norm": 0.5226536989212036, "learning_rate": 4.281663116367355e-05, "loss": 0.1092, "num_input_tokens_seen": 2368088, "step": 12420 }, { "epoch": 6.457900207900208, "grad_norm": 0.3709852993488312, "learning_rate": 4.2808674835908116e-05, "loss": 0.0732, "num_input_tokens_seen": 2369016, "step": 12425 }, { "epoch": 6.460498960498961, "grad_norm": 0.1807837039232254, "learning_rate": 4.280071484447388e-05, "loss": 0.1047, "num_input_tokens_seen": 2370008, "step": 12430 }, { "epoch": 6.463097713097713, "grad_norm": 0.9689968228340149, "learning_rate": 4.279275119100842e-05, "loss": 0.1036, "num_input_tokens_seen": 2370904, "step": 12435 }, { "epoch": 6.465696465696466, "grad_norm": 0.940930962562561, "learning_rate": 4.2784783877150035e-05, "loss": 0.143, "num_input_tokens_seen": 2371960, "step": 12440 }, { "epoch": 6.468295218295219, "grad_norm": 0.1757315695285797, "learning_rate": 4.27768129045378e-05, "loss": 0.0502, "num_input_tokens_seen": 2372856, "step": 12445 }, { "epoch": 6.470893970893971, "grad_norm": 0.7418129444122314, "learning_rate": 4.2768838274811525e-05, "loss": 0.2295, "num_input_tokens_seen": 2373816, "step": 12450 }, { "epoch": 6.473492723492724, "grad_norm": 0.5027737021446228, "learning_rate": 4.276085998961179e-05, "loss": 0.0611, "num_input_tokens_seen": 2374744, "step": 12455 }, { "epoch": 6.476091476091476, "grad_norm": 1.2207921743392944, "learning_rate": 4.275287805057992e-05, "loss": 0.1093, "num_input_tokens_seen": 2375672, "step": 12460 }, { "epoch": 6.478690228690229, "grad_norm": 0.4448271095752716, "learning_rate": 4.2744892459357964e-05, "loss": 0.1866, "num_input_tokens_seen": 2376600, "step": 12465 }, { "epoch": 6.481288981288981, "grad_norm": 0.8947139978408813, "learning_rate": 4.273690321758879e-05, "loss": 0.1174, "num_input_tokens_seen": 2377560, "step": 12470 }, { "epoch": 6.483887733887734, "grad_norm": 0.32350996136665344, "learning_rate": 4.272891032691594e-05, "loss": 0.0222, "num_input_tokens_seen": 2378520, "step": 12475 }, { "epoch": 6.486486486486487, "grad_norm": 1.2370212078094482, "learning_rate": 4.272091378898376e-05, "loss": 0.1915, "num_input_tokens_seen": 2379512, "step": 12480 }, { "epoch": 6.489085239085239, "grad_norm": 0.12237811088562012, "learning_rate": 4.271291360543733e-05, "loss": 0.0823, "num_input_tokens_seen": 2380472, "step": 12485 }, { "epoch": 6.491683991683992, "grad_norm": 0.2318621277809143, "learning_rate": 4.270490977792246e-05, "loss": 0.0742, "num_input_tokens_seen": 2381400, "step": 12490 }, { "epoch": 6.494282744282744, "grad_norm": 0.19110596179962158, "learning_rate": 4.2696902308085764e-05, "loss": 0.0907, "num_input_tokens_seen": 2382424, "step": 12495 }, { "epoch": 6.496881496881497, "grad_norm": 0.18366535007953644, "learning_rate": 4.2688891197574534e-05, "loss": 0.0745, "num_input_tokens_seen": 2383416, "step": 12500 }, { "epoch": 6.49948024948025, "grad_norm": 0.21782736480236053, "learning_rate": 4.268087644803685e-05, "loss": 0.1167, "num_input_tokens_seen": 2384344, "step": 12505 }, { "epoch": 6.502079002079002, "grad_norm": 1.5297285318374634, "learning_rate": 4.267285806112156e-05, "loss": 0.2205, "num_input_tokens_seen": 2385368, "step": 12510 }, { "epoch": 6.504677754677755, "grad_norm": 0.4854603707790375, "learning_rate": 4.266483603847822e-05, "loss": 0.1991, "num_input_tokens_seen": 2386360, "step": 12515 }, { "epoch": 6.507276507276507, "grad_norm": 0.23102106153964996, "learning_rate": 4.265681038175715e-05, "loss": 0.1177, "num_input_tokens_seen": 2387352, "step": 12520 }, { "epoch": 6.50987525987526, "grad_norm": 0.7144346237182617, "learning_rate": 4.2648781092609424e-05, "loss": 0.1549, "num_input_tokens_seen": 2388248, "step": 12525 }, { "epoch": 6.512474012474012, "grad_norm": 0.18156196177005768, "learning_rate": 4.264074817268686e-05, "loss": 0.0755, "num_input_tokens_seen": 2389240, "step": 12530 }, { "epoch": 6.515072765072765, "grad_norm": 0.1488601118326187, "learning_rate": 4.263271162364201e-05, "loss": 0.1861, "num_input_tokens_seen": 2390168, "step": 12535 }, { "epoch": 6.517671517671518, "grad_norm": 0.1857563555240631, "learning_rate": 4.262467144712821e-05, "loss": 0.1282, "num_input_tokens_seen": 2391064, "step": 12540 }, { "epoch": 6.52027027027027, "grad_norm": 1.7137788534164429, "learning_rate": 4.2616627644799476e-05, "loss": 0.2541, "num_input_tokens_seen": 2392024, "step": 12545 }, { "epoch": 6.522869022869023, "grad_norm": 0.26202335953712463, "learning_rate": 4.260858021831065e-05, "loss": 0.158, "num_input_tokens_seen": 2392920, "step": 12550 }, { "epoch": 6.525467775467775, "grad_norm": 0.5488967895507812, "learning_rate": 4.260052916931726e-05, "loss": 0.29, "num_input_tokens_seen": 2393944, "step": 12555 }, { "epoch": 6.528066528066528, "grad_norm": 0.3482745289802551, "learning_rate": 4.259247449947559e-05, "loss": 0.197, "num_input_tokens_seen": 2394904, "step": 12560 }, { "epoch": 6.53066528066528, "grad_norm": 0.2826308310031891, "learning_rate": 4.258441621044268e-05, "loss": 0.0791, "num_input_tokens_seen": 2395832, "step": 12565 }, { "epoch": 6.533264033264033, "grad_norm": 0.6381884217262268, "learning_rate": 4.257635430387633e-05, "loss": 0.1013, "num_input_tokens_seen": 2396760, "step": 12570 }, { "epoch": 6.535862785862786, "grad_norm": 0.11638797074556351, "learning_rate": 4.256828878143506e-05, "loss": 0.131, "num_input_tokens_seen": 2397688, "step": 12575 }, { "epoch": 6.538461538461538, "grad_norm": 0.5199399590492249, "learning_rate": 4.2560219644778134e-05, "loss": 0.2398, "num_input_tokens_seen": 2398712, "step": 12580 }, { "epoch": 6.541060291060291, "grad_norm": 0.3595779240131378, "learning_rate": 4.255214689556557e-05, "loss": 0.101, "num_input_tokens_seen": 2399672, "step": 12585 }, { "epoch": 6.543659043659043, "grad_norm": 0.7325388193130493, "learning_rate": 4.2544070535458115e-05, "loss": 0.1519, "num_input_tokens_seen": 2400568, "step": 12590 }, { "epoch": 6.546257796257796, "grad_norm": 0.6011371612548828, "learning_rate": 4.2535990566117276e-05, "loss": 0.1863, "num_input_tokens_seen": 2401560, "step": 12595 }, { "epoch": 6.548856548856548, "grad_norm": 1.1169263124465942, "learning_rate": 4.25279069892053e-05, "loss": 0.2079, "num_input_tokens_seen": 2402552, "step": 12600 }, { "epoch": 6.551455301455301, "grad_norm": 0.29248514771461487, "learning_rate": 4.251981980638516e-05, "loss": 0.0882, "num_input_tokens_seen": 2403544, "step": 12605 }, { "epoch": 6.554054054054054, "grad_norm": 0.4293139576911926, "learning_rate": 4.25117290193206e-05, "loss": 0.0887, "num_input_tokens_seen": 2404504, "step": 12610 }, { "epoch": 6.5566528066528065, "grad_norm": 0.8404319286346436, "learning_rate": 4.250363462967607e-05, "loss": 0.3224, "num_input_tokens_seen": 2405496, "step": 12615 }, { "epoch": 6.5592515592515594, "grad_norm": 0.7725367546081543, "learning_rate": 4.249553663911679e-05, "loss": 0.0956, "num_input_tokens_seen": 2406456, "step": 12620 }, { "epoch": 6.5618503118503115, "grad_norm": 0.266090989112854, "learning_rate": 4.24874350493087e-05, "loss": 0.0337, "num_input_tokens_seen": 2407384, "step": 12625 }, { "epoch": 6.5644490644490645, "grad_norm": 0.6973559260368347, "learning_rate": 4.24793298619185e-05, "loss": 0.1884, "num_input_tokens_seen": 2408344, "step": 12630 }, { "epoch": 6.567047817047817, "grad_norm": 0.6874150037765503, "learning_rate": 4.247122107861361e-05, "loss": 0.1455, "num_input_tokens_seen": 2409304, "step": 12635 }, { "epoch": 6.56964656964657, "grad_norm": 0.6006804704666138, "learning_rate": 4.2463108701062206e-05, "loss": 0.1047, "num_input_tokens_seen": 2410232, "step": 12640 }, { "epoch": 6.5722453222453225, "grad_norm": 1.2491248846054077, "learning_rate": 4.24549927309332e-05, "loss": 0.1107, "num_input_tokens_seen": 2411192, "step": 12645 }, { "epoch": 6.574844074844075, "grad_norm": 0.2853272557258606, "learning_rate": 4.244687316989624e-05, "loss": 0.0821, "num_input_tokens_seen": 2412120, "step": 12650 }, { "epoch": 6.577442827442828, "grad_norm": 3.025212526321411, "learning_rate": 4.24387500196217e-05, "loss": 0.1884, "num_input_tokens_seen": 2413016, "step": 12655 }, { "epoch": 6.58004158004158, "grad_norm": 1.2589077949523926, "learning_rate": 4.2430623281780726e-05, "loss": 0.3225, "num_input_tokens_seen": 2413944, "step": 12660 }, { "epoch": 6.582640332640333, "grad_norm": 0.14834614098072052, "learning_rate": 4.242249295804517e-05, "loss": 0.1582, "num_input_tokens_seen": 2414904, "step": 12665 }, { "epoch": 6.585239085239085, "grad_norm": 1.2303091287612915, "learning_rate": 4.241435905008763e-05, "loss": 0.2632, "num_input_tokens_seen": 2415896, "step": 12670 }, { "epoch": 6.587837837837838, "grad_norm": 0.17002135515213013, "learning_rate": 4.240622155958145e-05, "loss": 0.1121, "num_input_tokens_seen": 2416856, "step": 12675 }, { "epoch": 6.590436590436591, "grad_norm": 0.24964407086372375, "learning_rate": 4.2398080488200716e-05, "loss": 0.1107, "num_input_tokens_seen": 2417816, "step": 12680 }, { "epoch": 6.593035343035343, "grad_norm": 0.13645730912685394, "learning_rate": 4.2389935837620224e-05, "loss": 0.1079, "num_input_tokens_seen": 2418808, "step": 12685 }, { "epoch": 6.595634095634096, "grad_norm": 0.1300186812877655, "learning_rate": 4.238178760951552e-05, "loss": 0.0946, "num_input_tokens_seen": 2419704, "step": 12690 }, { "epoch": 6.598232848232849, "grad_norm": 0.9352328777313232, "learning_rate": 4.2373635805562905e-05, "loss": 0.155, "num_input_tokens_seen": 2420728, "step": 12695 }, { "epoch": 6.600831600831601, "grad_norm": 0.14858481287956238, "learning_rate": 4.23654804274394e-05, "loss": 0.133, "num_input_tokens_seen": 2421624, "step": 12700 }, { "epoch": 6.603430353430354, "grad_norm": 0.3075921833515167, "learning_rate": 4.235732147682274e-05, "loss": 0.0965, "num_input_tokens_seen": 2422520, "step": 12705 }, { "epoch": 6.606029106029106, "grad_norm": 0.25550511479377747, "learning_rate": 4.2349158955391436e-05, "loss": 0.0967, "num_input_tokens_seen": 2423480, "step": 12710 }, { "epoch": 6.608627858627859, "grad_norm": 1.0576282739639282, "learning_rate": 4.2340992864824696e-05, "loss": 0.2793, "num_input_tokens_seen": 2424472, "step": 12715 }, { "epoch": 6.611226611226611, "grad_norm": 0.16673892736434937, "learning_rate": 4.2332823206802496e-05, "loss": 0.1088, "num_input_tokens_seen": 2425464, "step": 12720 }, { "epoch": 6.613825363825364, "grad_norm": 1.2095497846603394, "learning_rate": 4.232464998300552e-05, "loss": 0.1115, "num_input_tokens_seen": 2426392, "step": 12725 }, { "epoch": 6.616424116424117, "grad_norm": 0.6323404312133789, "learning_rate": 4.231647319511519e-05, "loss": 0.142, "num_input_tokens_seen": 2427320, "step": 12730 }, { "epoch": 6.619022869022869, "grad_norm": 0.1963091492652893, "learning_rate": 4.230829284481368e-05, "loss": 0.0926, "num_input_tokens_seen": 2428216, "step": 12735 }, { "epoch": 6.621621621621622, "grad_norm": 0.5662091970443726, "learning_rate": 4.2300108933783867e-05, "loss": 0.1047, "num_input_tokens_seen": 2429144, "step": 12740 }, { "epoch": 6.624220374220374, "grad_norm": 0.15953847765922546, "learning_rate": 4.229192146370939e-05, "loss": 0.2147, "num_input_tokens_seen": 2429976, "step": 12745 }, { "epoch": 6.626819126819127, "grad_norm": 1.0883666276931763, "learning_rate": 4.2283730436274604e-05, "loss": 0.2202, "num_input_tokens_seen": 2430904, "step": 12750 }, { "epoch": 6.629417879417879, "grad_norm": 0.4470324218273163, "learning_rate": 4.22755358531646e-05, "loss": 0.1311, "num_input_tokens_seen": 2431864, "step": 12755 }, { "epoch": 6.632016632016632, "grad_norm": 0.13098083436489105, "learning_rate": 4.226733771606519e-05, "loss": 0.1067, "num_input_tokens_seen": 2432920, "step": 12760 }, { "epoch": 6.634615384615385, "grad_norm": 0.5378458499908447, "learning_rate": 4.225913602666293e-05, "loss": 0.1389, "num_input_tokens_seen": 2433848, "step": 12765 }, { "epoch": 6.637214137214137, "grad_norm": 0.7883696556091309, "learning_rate": 4.225093078664511e-05, "loss": 0.2019, "num_input_tokens_seen": 2434744, "step": 12770 }, { "epoch": 6.63981288981289, "grad_norm": 1.011995792388916, "learning_rate": 4.224272199769973e-05, "loss": 0.1741, "num_input_tokens_seen": 2435672, "step": 12775 }, { "epoch": 6.642411642411642, "grad_norm": 0.179421529173851, "learning_rate": 4.2234509661515556e-05, "loss": 0.1806, "num_input_tokens_seen": 2436600, "step": 12780 }, { "epoch": 6.645010395010395, "grad_norm": 0.9468803405761719, "learning_rate": 4.2226293779782035e-05, "loss": 0.1594, "num_input_tokens_seen": 2437560, "step": 12785 }, { "epoch": 6.647609147609147, "grad_norm": 0.29007554054260254, "learning_rate": 4.2218074354189385e-05, "loss": 0.0918, "num_input_tokens_seen": 2438520, "step": 12790 }, { "epoch": 6.6502079002079, "grad_norm": 0.9329654574394226, "learning_rate": 4.2209851386428536e-05, "loss": 0.1258, "num_input_tokens_seen": 2439416, "step": 12795 }, { "epoch": 6.652806652806653, "grad_norm": 0.8370631337165833, "learning_rate": 4.220162487819114e-05, "loss": 0.1904, "num_input_tokens_seen": 2440376, "step": 12800 }, { "epoch": 6.655405405405405, "grad_norm": 0.6346704959869385, "learning_rate": 4.21933948311696e-05, "loss": 0.187, "num_input_tokens_seen": 2441272, "step": 12805 }, { "epoch": 6.658004158004158, "grad_norm": 0.712780237197876, "learning_rate": 4.218516124705702e-05, "loss": 0.1536, "num_input_tokens_seen": 2442200, "step": 12810 }, { "epoch": 6.66060291060291, "grad_norm": 0.05916266888380051, "learning_rate": 4.217692412754724e-05, "loss": 0.0656, "num_input_tokens_seen": 2443064, "step": 12815 }, { "epoch": 6.663201663201663, "grad_norm": 0.5159199833869934, "learning_rate": 4.216868347433486e-05, "loss": 0.1189, "num_input_tokens_seen": 2444024, "step": 12820 }, { "epoch": 6.665800415800415, "grad_norm": 1.219241976737976, "learning_rate": 4.216043928911515e-05, "loss": 0.1506, "num_input_tokens_seen": 2444920, "step": 12825 }, { "epoch": 6.668399168399168, "grad_norm": 2.7302751541137695, "learning_rate": 4.215219157358416e-05, "loss": 0.188, "num_input_tokens_seen": 2445752, "step": 12830 }, { "epoch": 6.670997920997921, "grad_norm": 1.092903971672058, "learning_rate": 4.21439403294386e-05, "loss": 0.276, "num_input_tokens_seen": 2446744, "step": 12835 }, { "epoch": 6.673596673596673, "grad_norm": 0.29161664843559265, "learning_rate": 4.2135685558375994e-05, "loss": 0.1355, "num_input_tokens_seen": 2447736, "step": 12840 }, { "epoch": 6.676195426195426, "grad_norm": 1.3925632238388062, "learning_rate": 4.2127427262094525e-05, "loss": 0.108, "num_input_tokens_seen": 2448728, "step": 12845 }, { "epoch": 6.6787941787941785, "grad_norm": 0.18913070857524872, "learning_rate": 4.211916544229312e-05, "loss": 0.0912, "num_input_tokens_seen": 2449656, "step": 12850 }, { "epoch": 6.6813929313929314, "grad_norm": 0.583928108215332, "learning_rate": 4.2110900100671435e-05, "loss": 0.1064, "num_input_tokens_seen": 2450648, "step": 12855 }, { "epoch": 6.6839916839916835, "grad_norm": 0.7111544013023376, "learning_rate": 4.2102631238929845e-05, "loss": 0.1331, "num_input_tokens_seen": 2451672, "step": 12860 }, { "epoch": 6.6865904365904365, "grad_norm": 0.561428427696228, "learning_rate": 4.209435885876944e-05, "loss": 0.1037, "num_input_tokens_seen": 2452600, "step": 12865 }, { "epoch": 6.6891891891891895, "grad_norm": 0.6436538100242615, "learning_rate": 4.208608296189208e-05, "loss": 0.1614, "num_input_tokens_seen": 2453624, "step": 12870 }, { "epoch": 6.691787941787942, "grad_norm": 1.423701286315918, "learning_rate": 4.2077803550000286e-05, "loss": 0.2488, "num_input_tokens_seen": 2454552, "step": 12875 }, { "epoch": 6.6943866943866945, "grad_norm": 0.38918131589889526, "learning_rate": 4.206952062479735e-05, "loss": 0.1312, "num_input_tokens_seen": 2455448, "step": 12880 }, { "epoch": 6.696985446985447, "grad_norm": 0.781156599521637, "learning_rate": 4.206123418798724e-05, "loss": 0.262, "num_input_tokens_seen": 2456344, "step": 12885 }, { "epoch": 6.6995841995842, "grad_norm": 0.9381308555603027, "learning_rate": 4.205294424127469e-05, "loss": 0.1646, "num_input_tokens_seen": 2457272, "step": 12890 }, { "epoch": 6.702182952182953, "grad_norm": 0.9728926420211792, "learning_rate": 4.2044650786365145e-05, "loss": 0.1273, "num_input_tokens_seen": 2458168, "step": 12895 }, { "epoch": 6.704781704781705, "grad_norm": 0.615874171257019, "learning_rate": 4.203635382496476e-05, "loss": 0.118, "num_input_tokens_seen": 2459128, "step": 12900 }, { "epoch": 6.707380457380458, "grad_norm": 0.6363153457641602, "learning_rate": 4.202805335878042e-05, "loss": 0.1239, "num_input_tokens_seen": 2460088, "step": 12905 }, { "epoch": 6.70997920997921, "grad_norm": 0.38018256425857544, "learning_rate": 4.2019749389519716e-05, "loss": 0.1324, "num_input_tokens_seen": 2461016, "step": 12910 }, { "epoch": 6.712577962577963, "grad_norm": 0.6561496257781982, "learning_rate": 4.201144191889099e-05, "loss": 0.1323, "num_input_tokens_seen": 2461976, "step": 12915 }, { "epoch": 6.715176715176716, "grad_norm": 0.16597211360931396, "learning_rate": 4.200313094860328e-05, "loss": 0.1741, "num_input_tokens_seen": 2463000, "step": 12920 }, { "epoch": 6.717775467775468, "grad_norm": 0.38952264189720154, "learning_rate": 4.199481648036634e-05, "loss": 0.1092, "num_input_tokens_seen": 2464056, "step": 12925 }, { "epoch": 6.720374220374221, "grad_norm": 0.5891512632369995, "learning_rate": 4.198649851589067e-05, "loss": 0.1039, "num_input_tokens_seen": 2464984, "step": 12930 }, { "epoch": 6.722972972972973, "grad_norm": 1.0766749382019043, "learning_rate": 4.1978177056887466e-05, "loss": 0.1724, "num_input_tokens_seen": 2466040, "step": 12935 }, { "epoch": 6.725571725571726, "grad_norm": 1.0487608909606934, "learning_rate": 4.196985210506865e-05, "loss": 0.1789, "num_input_tokens_seen": 2467000, "step": 12940 }, { "epoch": 6.728170478170478, "grad_norm": 0.16283540427684784, "learning_rate": 4.1961523662146876e-05, "loss": 0.1678, "num_input_tokens_seen": 2467928, "step": 12945 }, { "epoch": 6.730769230769231, "grad_norm": 0.34178996086120605, "learning_rate": 4.195319172983548e-05, "loss": 0.1536, "num_input_tokens_seen": 2468888, "step": 12950 }, { "epoch": 6.733367983367984, "grad_norm": 0.960452675819397, "learning_rate": 4.1944856309848554e-05, "loss": 0.1324, "num_input_tokens_seen": 2469880, "step": 12955 }, { "epoch": 6.735966735966736, "grad_norm": 0.8534839749336243, "learning_rate": 4.193651740390089e-05, "loss": 0.1091, "num_input_tokens_seen": 2470840, "step": 12960 }, { "epoch": 6.738565488565489, "grad_norm": 0.24491606652736664, "learning_rate": 4.1928175013708e-05, "loss": 0.1869, "num_input_tokens_seen": 2471832, "step": 12965 }, { "epoch": 6.741164241164241, "grad_norm": 0.972752034664154, "learning_rate": 4.191982914098611e-05, "loss": 0.1845, "num_input_tokens_seen": 2472792, "step": 12970 }, { "epoch": 6.743762993762994, "grad_norm": 0.8215354681015015, "learning_rate": 4.191147978745218e-05, "loss": 0.3135, "num_input_tokens_seen": 2473784, "step": 12975 }, { "epoch": 6.746361746361746, "grad_norm": 0.7048352956771851, "learning_rate": 4.1903126954823846e-05, "loss": 0.1546, "num_input_tokens_seen": 2474776, "step": 12980 }, { "epoch": 6.748960498960499, "grad_norm": 0.669693648815155, "learning_rate": 4.189477064481949e-05, "loss": 0.1518, "num_input_tokens_seen": 2475736, "step": 12985 }, { "epoch": 6.751559251559252, "grad_norm": 0.6371654272079468, "learning_rate": 4.188641085915822e-05, "loss": 0.139, "num_input_tokens_seen": 2476728, "step": 12990 }, { "epoch": 6.754158004158004, "grad_norm": 0.9540055394172668, "learning_rate": 4.1878047599559834e-05, "loss": 0.1572, "num_input_tokens_seen": 2477656, "step": 12995 }, { "epoch": 6.756756756756757, "grad_norm": 0.1346551775932312, "learning_rate": 4.186968086774484e-05, "loss": 0.1866, "num_input_tokens_seen": 2478616, "step": 13000 }, { "epoch": 6.759355509355509, "grad_norm": 1.2166634798049927, "learning_rate": 4.1861310665434504e-05, "loss": 0.1109, "num_input_tokens_seen": 2479512, "step": 13005 }, { "epoch": 6.761954261954262, "grad_norm": 0.9809253811836243, "learning_rate": 4.185293699435075e-05, "loss": 0.2378, "num_input_tokens_seen": 2480376, "step": 13010 }, { "epoch": 6.764553014553014, "grad_norm": 1.1090859174728394, "learning_rate": 4.184455985621626e-05, "loss": 0.2466, "num_input_tokens_seen": 2481336, "step": 13015 }, { "epoch": 6.767151767151767, "grad_norm": 0.9420390725135803, "learning_rate": 4.18361792527544e-05, "loss": 0.1538, "num_input_tokens_seen": 2482264, "step": 13020 }, { "epoch": 6.76975051975052, "grad_norm": 0.3206945061683655, "learning_rate": 4.182779518568926e-05, "loss": 0.0781, "num_input_tokens_seen": 2483224, "step": 13025 }, { "epoch": 6.772349272349272, "grad_norm": 0.8138172030448914, "learning_rate": 4.181940765674564e-05, "loss": 0.0606, "num_input_tokens_seen": 2484184, "step": 13030 }, { "epoch": 6.774948024948025, "grad_norm": 0.15689414739608765, "learning_rate": 4.181101666764906e-05, "loss": 0.0382, "num_input_tokens_seen": 2485144, "step": 13035 }, { "epoch": 6.777546777546777, "grad_norm": 0.20181749761104584, "learning_rate": 4.1802622220125756e-05, "loss": 0.1201, "num_input_tokens_seen": 2486104, "step": 13040 }, { "epoch": 6.78014553014553, "grad_norm": 0.38477155566215515, "learning_rate": 4.1794224315902645e-05, "loss": 0.1048, "num_input_tokens_seen": 2487096, "step": 13045 }, { "epoch": 6.782744282744282, "grad_norm": 0.27712684869766235, "learning_rate": 4.1785822956707386e-05, "loss": 0.0726, "num_input_tokens_seen": 2487992, "step": 13050 }, { "epoch": 6.785343035343035, "grad_norm": 0.5545973777770996, "learning_rate": 4.1777418144268346e-05, "loss": 0.1423, "num_input_tokens_seen": 2489016, "step": 13055 }, { "epoch": 6.787941787941788, "grad_norm": 0.36489391326904297, "learning_rate": 4.1769009880314583e-05, "loss": 0.1222, "num_input_tokens_seen": 2489976, "step": 13060 }, { "epoch": 6.79054054054054, "grad_norm": 1.0001052618026733, "learning_rate": 4.176059816657588e-05, "loss": 0.2488, "num_input_tokens_seen": 2490872, "step": 13065 }, { "epoch": 6.793139293139293, "grad_norm": 0.4692732095718384, "learning_rate": 4.175218300478274e-05, "loss": 0.118, "num_input_tokens_seen": 2491800, "step": 13070 }, { "epoch": 6.795738045738045, "grad_norm": 0.2468710094690323, "learning_rate": 4.174376439666634e-05, "loss": 0.1448, "num_input_tokens_seen": 2492792, "step": 13075 }, { "epoch": 6.798336798336798, "grad_norm": 0.04202817380428314, "learning_rate": 4.173534234395861e-05, "loss": 0.1255, "num_input_tokens_seen": 2493688, "step": 13080 }, { "epoch": 6.8009355509355505, "grad_norm": 1.0526487827301025, "learning_rate": 4.172691684839215e-05, "loss": 0.1648, "num_input_tokens_seen": 2494648, "step": 13085 }, { "epoch": 6.803534303534303, "grad_norm": 0.21898511052131653, "learning_rate": 4.1718487911700296e-05, "loss": 0.0984, "num_input_tokens_seen": 2495608, "step": 13090 }, { "epoch": 6.806133056133056, "grad_norm": 0.8652610778808594, "learning_rate": 4.171005553561708e-05, "loss": 0.2611, "num_input_tokens_seen": 2496568, "step": 13095 }, { "epoch": 6.8087318087318085, "grad_norm": 0.6691230535507202, "learning_rate": 4.1701619721877235e-05, "loss": 0.219, "num_input_tokens_seen": 2497560, "step": 13100 }, { "epoch": 6.8113305613305615, "grad_norm": 0.7390992641448975, "learning_rate": 4.169318047221621e-05, "loss": 0.0813, "num_input_tokens_seen": 2498488, "step": 13105 }, { "epoch": 6.813929313929314, "grad_norm": 0.5382958650588989, "learning_rate": 4.1684737788370175e-05, "loss": 0.1266, "num_input_tokens_seen": 2499384, "step": 13110 }, { "epoch": 6.8165280665280665, "grad_norm": 1.567931056022644, "learning_rate": 4.1676291672075976e-05, "loss": 0.218, "num_input_tokens_seen": 2500376, "step": 13115 }, { "epoch": 6.8191268191268195, "grad_norm": 0.6852501630783081, "learning_rate": 4.166784212507118e-05, "loss": 0.1248, "num_input_tokens_seen": 2501336, "step": 13120 }, { "epoch": 6.821725571725572, "grad_norm": 0.20073091983795166, "learning_rate": 4.165938914909407e-05, "loss": 0.2451, "num_input_tokens_seen": 2502264, "step": 13125 }, { "epoch": 6.824324324324325, "grad_norm": 1.0892996788024902, "learning_rate": 4.165093274588362e-05, "loss": 0.1867, "num_input_tokens_seen": 2503160, "step": 13130 }, { "epoch": 6.826923076923077, "grad_norm": 1.3536661863327026, "learning_rate": 4.164247291717951e-05, "loss": 0.2723, "num_input_tokens_seen": 2504152, "step": 13135 }, { "epoch": 6.82952182952183, "grad_norm": 0.23954910039901733, "learning_rate": 4.163400966472213e-05, "loss": 0.1133, "num_input_tokens_seen": 2505208, "step": 13140 }, { "epoch": 6.832120582120583, "grad_norm": 0.6185956001281738, "learning_rate": 4.1625542990252576e-05, "loss": 0.1064, "num_input_tokens_seen": 2506136, "step": 13145 }, { "epoch": 6.834719334719335, "grad_norm": 1.468801736831665, "learning_rate": 4.161707289551264e-05, "loss": 0.1954, "num_input_tokens_seen": 2507128, "step": 13150 }, { "epoch": 6.837318087318088, "grad_norm": 0.18628118932247162, "learning_rate": 4.160859938224483e-05, "loss": 0.1231, "num_input_tokens_seen": 2508248, "step": 13155 }, { "epoch": 6.83991683991684, "grad_norm": 0.958967924118042, "learning_rate": 4.160012245219234e-05, "loss": 0.1724, "num_input_tokens_seen": 2509208, "step": 13160 }, { "epoch": 6.842515592515593, "grad_norm": 0.9630007147789001, "learning_rate": 4.159164210709908e-05, "loss": 0.1289, "num_input_tokens_seen": 2510136, "step": 13165 }, { "epoch": 6.845114345114345, "grad_norm": 0.23647700250148773, "learning_rate": 4.1583158348709665e-05, "loss": 0.071, "num_input_tokens_seen": 2511032, "step": 13170 }, { "epoch": 6.847713097713098, "grad_norm": 0.28763267397880554, "learning_rate": 4.157467117876941e-05, "loss": 0.0783, "num_input_tokens_seen": 2511960, "step": 13175 }, { "epoch": 6.850311850311851, "grad_norm": 0.7119023203849792, "learning_rate": 4.156618059902431e-05, "loss": 0.2208, "num_input_tokens_seen": 2512888, "step": 13180 }, { "epoch": 6.852910602910603, "grad_norm": 1.1050554513931274, "learning_rate": 4.155768661122108e-05, "loss": 0.1304, "num_input_tokens_seen": 2513880, "step": 13185 }, { "epoch": 6.855509355509356, "grad_norm": 0.7930414080619812, "learning_rate": 4.154918921710716e-05, "loss": 0.1707, "num_input_tokens_seen": 2514936, "step": 13190 }, { "epoch": 6.858108108108108, "grad_norm": 1.1952714920043945, "learning_rate": 4.154068841843064e-05, "loss": 0.129, "num_input_tokens_seen": 2515864, "step": 13195 }, { "epoch": 6.860706860706861, "grad_norm": 1.0344345569610596, "learning_rate": 4.153218421694035e-05, "loss": 0.291, "num_input_tokens_seen": 2516824, "step": 13200 }, { "epoch": 6.863305613305613, "grad_norm": 0.1806076616048813, "learning_rate": 4.152367661438581e-05, "loss": 0.2286, "num_input_tokens_seen": 2517816, "step": 13205 }, { "epoch": 6.865904365904366, "grad_norm": 1.0438252687454224, "learning_rate": 4.1515165612517235e-05, "loss": 0.2329, "num_input_tokens_seen": 2518776, "step": 13210 }, { "epoch": 6.868503118503119, "grad_norm": 1.0467321872711182, "learning_rate": 4.150665121308554e-05, "loss": 0.2196, "num_input_tokens_seen": 2519736, "step": 13215 }, { "epoch": 6.871101871101871, "grad_norm": 0.32738015055656433, "learning_rate": 4.149813341784232e-05, "loss": 0.127, "num_input_tokens_seen": 2520664, "step": 13220 }, { "epoch": 6.873700623700624, "grad_norm": 0.2550418972969055, "learning_rate": 4.148961222853992e-05, "loss": 0.1939, "num_input_tokens_seen": 2521720, "step": 13225 }, { "epoch": 6.876299376299376, "grad_norm": 0.3412472903728485, "learning_rate": 4.148108764693133e-05, "loss": 0.1747, "num_input_tokens_seen": 2522680, "step": 13230 }, { "epoch": 6.878898128898129, "grad_norm": 0.1921272873878479, "learning_rate": 4.147255967477027e-05, "loss": 0.0651, "num_input_tokens_seen": 2523608, "step": 13235 }, { "epoch": 6.881496881496881, "grad_norm": 0.5877173542976379, "learning_rate": 4.146402831381115e-05, "loss": 0.1411, "num_input_tokens_seen": 2524536, "step": 13240 }, { "epoch": 6.884095634095634, "grad_norm": 0.45156803727149963, "learning_rate": 4.145549356580906e-05, "loss": 0.1329, "num_input_tokens_seen": 2525464, "step": 13245 }, { "epoch": 6.886694386694387, "grad_norm": 0.694218635559082, "learning_rate": 4.144695543251982e-05, "loss": 0.0776, "num_input_tokens_seen": 2526424, "step": 13250 }, { "epoch": 6.889293139293139, "grad_norm": 0.14234676957130432, "learning_rate": 4.1438413915699904e-05, "loss": 0.1391, "num_input_tokens_seen": 2527416, "step": 13255 }, { "epoch": 6.891891891891892, "grad_norm": 0.28576356172561646, "learning_rate": 4.1429869017106525e-05, "loss": 0.1222, "num_input_tokens_seen": 2528376, "step": 13260 }, { "epoch": 6.894490644490644, "grad_norm": 0.44404298067092896, "learning_rate": 4.1421320738497565e-05, "loss": 0.1779, "num_input_tokens_seen": 2529336, "step": 13265 }, { "epoch": 6.897089397089397, "grad_norm": 0.21977536380290985, "learning_rate": 4.141276908163162e-05, "loss": 0.1225, "num_input_tokens_seen": 2530264, "step": 13270 }, { "epoch": 6.899688149688149, "grad_norm": 0.2791031301021576, "learning_rate": 4.140421404826794e-05, "loss": 0.1578, "num_input_tokens_seen": 2531224, "step": 13275 }, { "epoch": 6.902286902286902, "grad_norm": 0.8172144889831543, "learning_rate": 4.1395655640166533e-05, "loss": 0.1105, "num_input_tokens_seen": 2532280, "step": 13280 }, { "epoch": 6.904885654885655, "grad_norm": 0.20997896790504456, "learning_rate": 4.1387093859088045e-05, "loss": 0.0463, "num_input_tokens_seen": 2533240, "step": 13285 }, { "epoch": 6.907484407484407, "grad_norm": 1.6683634519577026, "learning_rate": 4.137852870679385e-05, "loss": 0.2293, "num_input_tokens_seen": 2534232, "step": 13290 }, { "epoch": 6.91008316008316, "grad_norm": 1.1953328847885132, "learning_rate": 4.1369960185046e-05, "loss": 0.1342, "num_input_tokens_seen": 2535192, "step": 13295 }, { "epoch": 6.912681912681912, "grad_norm": 1.4826910495758057, "learning_rate": 4.136138829560724e-05, "loss": 0.1408, "num_input_tokens_seen": 2536120, "step": 13300 }, { "epoch": 6.915280665280665, "grad_norm": 0.549210250377655, "learning_rate": 4.135281304024102e-05, "loss": 0.1162, "num_input_tokens_seen": 2537048, "step": 13305 }, { "epoch": 6.917879417879417, "grad_norm": 0.6109737753868103, "learning_rate": 4.1344234420711476e-05, "loss": 0.1853, "num_input_tokens_seen": 2538072, "step": 13310 }, { "epoch": 6.92047817047817, "grad_norm": 0.7675170302391052, "learning_rate": 4.133565243878342e-05, "loss": 0.0782, "num_input_tokens_seen": 2539064, "step": 13315 }, { "epoch": 6.923076923076923, "grad_norm": 0.8867676258087158, "learning_rate": 4.132706709622238e-05, "loss": 0.0877, "num_input_tokens_seen": 2539992, "step": 13320 }, { "epoch": 6.925675675675675, "grad_norm": 0.5435839295387268, "learning_rate": 4.131847839479457e-05, "loss": 0.0811, "num_input_tokens_seen": 2540952, "step": 13325 }, { "epoch": 6.928274428274428, "grad_norm": 0.6520805358886719, "learning_rate": 4.130988633626688e-05, "loss": 0.1439, "num_input_tokens_seen": 2541912, "step": 13330 }, { "epoch": 6.9308731808731805, "grad_norm": 0.728545606136322, "learning_rate": 4.130129092240691e-05, "loss": 0.3222, "num_input_tokens_seen": 2542872, "step": 13335 }, { "epoch": 6.9334719334719335, "grad_norm": 0.5388525128364563, "learning_rate": 4.1292692154982936e-05, "loss": 0.2035, "num_input_tokens_seen": 2543864, "step": 13340 }, { "epoch": 6.936070686070686, "grad_norm": 0.6674851179122925, "learning_rate": 4.128409003576393e-05, "loss": 0.1891, "num_input_tokens_seen": 2544792, "step": 13345 }, { "epoch": 6.9386694386694385, "grad_norm": 0.33253014087677, "learning_rate": 4.127548456651956e-05, "loss": 0.0906, "num_input_tokens_seen": 2545752, "step": 13350 }, { "epoch": 6.9412681912681915, "grad_norm": 0.6414290070533752, "learning_rate": 4.126687574902017e-05, "loss": 0.1617, "num_input_tokens_seen": 2546680, "step": 13355 }, { "epoch": 6.943866943866944, "grad_norm": 1.0278971195220947, "learning_rate": 4.1258263585036797e-05, "loss": 0.1786, "num_input_tokens_seen": 2547576, "step": 13360 }, { "epoch": 6.946465696465697, "grad_norm": 0.19833745062351227, "learning_rate": 4.1249648076341165e-05, "loss": 0.1031, "num_input_tokens_seen": 2548568, "step": 13365 }, { "epoch": 6.9490644490644495, "grad_norm": 0.3686937391757965, "learning_rate": 4.1241029224705704e-05, "loss": 0.0832, "num_input_tokens_seen": 2549464, "step": 13370 }, { "epoch": 6.951663201663202, "grad_norm": 1.2198868989944458, "learning_rate": 4.123240703190351e-05, "loss": 0.1655, "num_input_tokens_seen": 2550392, "step": 13375 }, { "epoch": 6.954261954261955, "grad_norm": 0.8653234243392944, "learning_rate": 4.122378149970836e-05, "loss": 0.1974, "num_input_tokens_seen": 2551352, "step": 13380 }, { "epoch": 6.956860706860707, "grad_norm": 0.6134162545204163, "learning_rate": 4.121515262989475e-05, "loss": 0.1313, "num_input_tokens_seen": 2552376, "step": 13385 }, { "epoch": 6.95945945945946, "grad_norm": 0.1031472384929657, "learning_rate": 4.1206520424237835e-05, "loss": 0.1645, "num_input_tokens_seen": 2553368, "step": 13390 }, { "epoch": 6.962058212058212, "grad_norm": 0.20036552846431732, "learning_rate": 4.1197884884513474e-05, "loss": 0.0481, "num_input_tokens_seen": 2554392, "step": 13395 }, { "epoch": 6.964656964656965, "grad_norm": 0.9360681176185608, "learning_rate": 4.1189246012498196e-05, "loss": 0.1023, "num_input_tokens_seen": 2555352, "step": 13400 }, { "epoch": 6.967255717255718, "grad_norm": 0.3058229982852936, "learning_rate": 4.118060380996922e-05, "loss": 0.1219, "num_input_tokens_seen": 2556312, "step": 13405 }, { "epoch": 6.96985446985447, "grad_norm": 0.9229124784469604, "learning_rate": 4.117195827870446e-05, "loss": 0.192, "num_input_tokens_seen": 2557176, "step": 13410 }, { "epoch": 6.972453222453223, "grad_norm": 1.3244298696517944, "learning_rate": 4.11633094204825e-05, "loss": 0.2319, "num_input_tokens_seen": 2558104, "step": 13415 }, { "epoch": 6.975051975051975, "grad_norm": 0.1405150145292282, "learning_rate": 4.1154657237082614e-05, "loss": 0.1572, "num_input_tokens_seen": 2559064, "step": 13420 }, { "epoch": 6.977650727650728, "grad_norm": 0.3846433758735657, "learning_rate": 4.114600173028478e-05, "loss": 0.2173, "num_input_tokens_seen": 2559896, "step": 13425 }, { "epoch": 6.98024948024948, "grad_norm": 0.47750163078308105, "learning_rate": 4.1137342901869635e-05, "loss": 0.2936, "num_input_tokens_seen": 2560888, "step": 13430 }, { "epoch": 6.982848232848233, "grad_norm": 0.2389918863773346, "learning_rate": 4.11286807536185e-05, "loss": 0.1177, "num_input_tokens_seen": 2561816, "step": 13435 }, { "epoch": 6.985446985446986, "grad_norm": 0.22413504123687744, "learning_rate": 4.1120015287313386e-05, "loss": 0.1211, "num_input_tokens_seen": 2562776, "step": 13440 }, { "epoch": 6.988045738045738, "grad_norm": 0.40419715642929077, "learning_rate": 4.1111346504736985e-05, "loss": 0.1244, "num_input_tokens_seen": 2563704, "step": 13445 }, { "epoch": 6.990644490644491, "grad_norm": 1.140383243560791, "learning_rate": 4.110267440767268e-05, "loss": 0.1653, "num_input_tokens_seen": 2564600, "step": 13450 }, { "epoch": 6.993243243243243, "grad_norm": 0.2291990965604782, "learning_rate": 4.109399899790452e-05, "loss": 0.1138, "num_input_tokens_seen": 2565560, "step": 13455 }, { "epoch": 6.995841995841996, "grad_norm": 0.6089428067207336, "learning_rate": 4.108532027721724e-05, "loss": 0.0941, "num_input_tokens_seen": 2566552, "step": 13460 }, { "epoch": 6.998440748440748, "grad_norm": 0.595854640007019, "learning_rate": 4.1076638247396284e-05, "loss": 0.1069, "num_input_tokens_seen": 2567512, "step": 13465 }, { "epoch": 7.0, "eval_loss": 0.14862456917762756, "eval_runtime": 8.0157, "eval_samples_per_second": 106.791, "eval_steps_per_second": 26.698, "num_input_tokens_seen": 2568040, "step": 13468 }, { "epoch": 7.001039501039501, "grad_norm": 0.3600076735019684, "learning_rate": 4.106795291022773e-05, "loss": 0.089, "num_input_tokens_seen": 2568424, "step": 13470 }, { "epoch": 7.003638253638254, "grad_norm": 1.1922191381454468, "learning_rate": 4.1059264267498355e-05, "loss": 0.2276, "num_input_tokens_seen": 2569320, "step": 13475 }, { "epoch": 7.006237006237006, "grad_norm": 0.2520451545715332, "learning_rate": 4.105057232099563e-05, "loss": 0.108, "num_input_tokens_seen": 2570280, "step": 13480 }, { "epoch": 7.008835758835759, "grad_norm": 0.7506983876228333, "learning_rate": 4.1041877072507695e-05, "loss": 0.2163, "num_input_tokens_seen": 2571144, "step": 13485 }, { "epoch": 7.011434511434511, "grad_norm": 0.862585723400116, "learning_rate": 4.103317852382337e-05, "loss": 0.1471, "num_input_tokens_seen": 2572136, "step": 13490 }, { "epoch": 7.014033264033264, "grad_norm": 1.2066926956176758, "learning_rate": 4.102447667673215e-05, "loss": 0.1853, "num_input_tokens_seen": 2573128, "step": 13495 }, { "epoch": 7.016632016632016, "grad_norm": 0.2614802122116089, "learning_rate": 4.1015771533024215e-05, "loss": 0.051, "num_input_tokens_seen": 2574152, "step": 13500 }, { "epoch": 7.019230769230769, "grad_norm": 0.3390886187553406, "learning_rate": 4.100706309449042e-05, "loss": 0.0495, "num_input_tokens_seen": 2575016, "step": 13505 }, { "epoch": 7.021829521829522, "grad_norm": 0.11367063969373703, "learning_rate": 4.099835136292231e-05, "loss": 0.1442, "num_input_tokens_seen": 2575912, "step": 13510 }, { "epoch": 7.024428274428274, "grad_norm": 0.3766651749610901, "learning_rate": 4.098963634011207e-05, "loss": 0.1058, "num_input_tokens_seen": 2576936, "step": 13515 }, { "epoch": 7.027027027027027, "grad_norm": 0.16788457334041595, "learning_rate": 4.09809180278526e-05, "loss": 0.17, "num_input_tokens_seen": 2577864, "step": 13520 }, { "epoch": 7.029625779625779, "grad_norm": 0.9944678544998169, "learning_rate": 4.0972196427937474e-05, "loss": 0.1724, "num_input_tokens_seen": 2578824, "step": 13525 }, { "epoch": 7.032224532224532, "grad_norm": 0.28711220622062683, "learning_rate": 4.096347154216092e-05, "loss": 0.176, "num_input_tokens_seen": 2579816, "step": 13530 }, { "epoch": 7.034823284823285, "grad_norm": 1.010288953781128, "learning_rate": 4.095474337231788e-05, "loss": 0.1289, "num_input_tokens_seen": 2580712, "step": 13535 }, { "epoch": 7.037422037422037, "grad_norm": 0.634878933429718, "learning_rate": 4.094601192020391e-05, "loss": 0.1289, "num_input_tokens_seen": 2581736, "step": 13540 }, { "epoch": 7.04002079002079, "grad_norm": 0.2181311547756195, "learning_rate": 4.09372771876153e-05, "loss": 0.1176, "num_input_tokens_seen": 2582696, "step": 13545 }, { "epoch": 7.042619542619542, "grad_norm": 0.4754558205604553, "learning_rate": 4.0928539176348986e-05, "loss": 0.0697, "num_input_tokens_seen": 2583592, "step": 13550 }, { "epoch": 7.045218295218295, "grad_norm": 0.42450618743896484, "learning_rate": 4.091979788820258e-05, "loss": 0.1304, "num_input_tokens_seen": 2584552, "step": 13555 }, { "epoch": 7.047817047817047, "grad_norm": 0.530154287815094, "learning_rate": 4.091105332497439e-05, "loss": 0.0958, "num_input_tokens_seen": 2585576, "step": 13560 }, { "epoch": 7.0504158004158, "grad_norm": 1.248270869255066, "learning_rate": 4.090230548846338e-05, "loss": 0.1163, "num_input_tokens_seen": 2586664, "step": 13565 }, { "epoch": 7.053014553014553, "grad_norm": 0.7672033309936523, "learning_rate": 4.089355438046916e-05, "loss": 0.0822, "num_input_tokens_seen": 2587624, "step": 13570 }, { "epoch": 7.0556133056133055, "grad_norm": 0.2074844390153885, "learning_rate": 4.088480000279207e-05, "loss": 0.0886, "num_input_tokens_seen": 2588584, "step": 13575 }, { "epoch": 7.058212058212058, "grad_norm": 0.23187042772769928, "learning_rate": 4.0876042357233084e-05, "loss": 0.1309, "num_input_tokens_seen": 2589512, "step": 13580 }, { "epoch": 7.0608108108108105, "grad_norm": 1.085688591003418, "learning_rate": 4.086728144559386e-05, "loss": 0.1111, "num_input_tokens_seen": 2590440, "step": 13585 }, { "epoch": 7.0634095634095635, "grad_norm": 0.7673143744468689, "learning_rate": 4.0858517269676725e-05, "loss": 0.1143, "num_input_tokens_seen": 2591368, "step": 13590 }, { "epoch": 7.066008316008316, "grad_norm": 0.697765588760376, "learning_rate": 4.084974983128468e-05, "loss": 0.1692, "num_input_tokens_seen": 2592360, "step": 13595 }, { "epoch": 7.0686070686070686, "grad_norm": 0.8717983961105347, "learning_rate": 4.08409791322214e-05, "loss": 0.1137, "num_input_tokens_seen": 2593256, "step": 13600 }, { "epoch": 7.0712058212058215, "grad_norm": 0.19381920993328094, "learning_rate": 4.083220517429122e-05, "loss": 0.0631, "num_input_tokens_seen": 2594184, "step": 13605 }, { "epoch": 7.073804573804574, "grad_norm": 0.2667998671531677, "learning_rate": 4.082342795929915e-05, "loss": 0.1823, "num_input_tokens_seen": 2595048, "step": 13610 }, { "epoch": 7.076403326403327, "grad_norm": 1.0410414934158325, "learning_rate": 4.081464748905088e-05, "loss": 0.1398, "num_input_tokens_seen": 2595976, "step": 13615 }, { "epoch": 7.079002079002079, "grad_norm": 0.2698492109775543, "learning_rate": 4.080586376535276e-05, "loss": 0.1344, "num_input_tokens_seen": 2597064, "step": 13620 }, { "epoch": 7.081600831600832, "grad_norm": 1.4637932777404785, "learning_rate": 4.0797076790011804e-05, "loss": 0.2125, "num_input_tokens_seen": 2598088, "step": 13625 }, { "epoch": 7.084199584199585, "grad_norm": 0.1475972682237625, "learning_rate": 4.078828656483572e-05, "loss": 0.1574, "num_input_tokens_seen": 2599016, "step": 13630 }, { "epoch": 7.086798336798337, "grad_norm": 0.7099676728248596, "learning_rate": 4.077949309163285e-05, "loss": 0.1889, "num_input_tokens_seen": 2599944, "step": 13635 }, { "epoch": 7.08939708939709, "grad_norm": 0.6743635535240173, "learning_rate": 4.077069637221222e-05, "loss": 0.1608, "num_input_tokens_seen": 2600872, "step": 13640 }, { "epoch": 7.091995841995842, "grad_norm": 2.22090744972229, "learning_rate": 4.0761896408383535e-05, "loss": 0.1485, "num_input_tokens_seen": 2601896, "step": 13645 }, { "epoch": 7.094594594594595, "grad_norm": 0.670953094959259, "learning_rate": 4.075309320195715e-05, "loss": 0.0918, "num_input_tokens_seen": 2602856, "step": 13650 }, { "epoch": 7.097193347193347, "grad_norm": 0.7239810824394226, "learning_rate": 4.0744286754744104e-05, "loss": 0.095, "num_input_tokens_seen": 2603784, "step": 13655 }, { "epoch": 7.0997920997921, "grad_norm": 0.7632430791854858, "learning_rate": 4.073547706855608e-05, "loss": 0.1329, "num_input_tokens_seen": 2604744, "step": 13660 }, { "epoch": 7.102390852390853, "grad_norm": 1.0854074954986572, "learning_rate": 4.072666414520546e-05, "loss": 0.1374, "num_input_tokens_seen": 2605736, "step": 13665 }, { "epoch": 7.104989604989605, "grad_norm": 0.365337997674942, "learning_rate": 4.071784798650525e-05, "loss": 0.0602, "num_input_tokens_seen": 2606664, "step": 13670 }, { "epoch": 7.107588357588358, "grad_norm": 0.32117506861686707, "learning_rate": 4.070902859426915e-05, "loss": 0.1105, "num_input_tokens_seen": 2607592, "step": 13675 }, { "epoch": 7.11018711018711, "grad_norm": 1.1175001859664917, "learning_rate": 4.0700205970311526e-05, "loss": 0.2384, "num_input_tokens_seen": 2608584, "step": 13680 }, { "epoch": 7.112785862785863, "grad_norm": 0.8167929649353027, "learning_rate": 4.06913801164474e-05, "loss": 0.1881, "num_input_tokens_seen": 2609512, "step": 13685 }, { "epoch": 7.115384615384615, "grad_norm": 0.9480805397033691, "learning_rate": 4.068255103449246e-05, "loss": 0.071, "num_input_tokens_seen": 2610440, "step": 13690 }, { "epoch": 7.117983367983368, "grad_norm": 1.847684621810913, "learning_rate": 4.0673718726263057e-05, "loss": 0.1534, "num_input_tokens_seen": 2611432, "step": 13695 }, { "epoch": 7.120582120582121, "grad_norm": 0.3337959051132202, "learning_rate": 4.066488319357621e-05, "loss": 0.132, "num_input_tokens_seen": 2612328, "step": 13700 }, { "epoch": 7.123180873180873, "grad_norm": 1.4200931787490845, "learning_rate": 4.0656044438249594e-05, "loss": 0.1157, "num_input_tokens_seen": 2613224, "step": 13705 }, { "epoch": 7.125779625779626, "grad_norm": 0.5147699117660522, "learning_rate": 4.064720246210155e-05, "loss": 0.1236, "num_input_tokens_seen": 2614120, "step": 13710 }, { "epoch": 7.128378378378378, "grad_norm": 1.2870018482208252, "learning_rate": 4.063835726695109e-05, "loss": 0.1675, "num_input_tokens_seen": 2615080, "step": 13715 }, { "epoch": 7.130977130977131, "grad_norm": 0.297615647315979, "learning_rate": 4.062950885461787e-05, "loss": 0.1057, "num_input_tokens_seen": 2615976, "step": 13720 }, { "epoch": 7.133575883575883, "grad_norm": 0.5720934271812439, "learning_rate": 4.0620657226922245e-05, "loss": 0.2248, "num_input_tokens_seen": 2616904, "step": 13725 }, { "epoch": 7.136174636174636, "grad_norm": 0.46151015162467957, "learning_rate": 4.0611802385685185e-05, "loss": 0.0576, "num_input_tokens_seen": 2617800, "step": 13730 }, { "epoch": 7.138773388773389, "grad_norm": 1.1769428253173828, "learning_rate": 4.060294433272833e-05, "loss": 0.265, "num_input_tokens_seen": 2618728, "step": 13735 }, { "epoch": 7.141372141372141, "grad_norm": 1.1968600749969482, "learning_rate": 4.059408306987402e-05, "loss": 0.2667, "num_input_tokens_seen": 2619624, "step": 13740 }, { "epoch": 7.143970893970894, "grad_norm": 0.48619866371154785, "learning_rate": 4.058521859894522e-05, "loss": 0.152, "num_input_tokens_seen": 2620552, "step": 13745 }, { "epoch": 7.146569646569646, "grad_norm": 1.240760087966919, "learning_rate": 4.057635092176555e-05, "loss": 0.2776, "num_input_tokens_seen": 2621480, "step": 13750 }, { "epoch": 7.149168399168399, "grad_norm": 0.5625585913658142, "learning_rate": 4.056748004015931e-05, "loss": 0.1241, "num_input_tokens_seen": 2622472, "step": 13755 }, { "epoch": 7.151767151767152, "grad_norm": 0.26561933755874634, "learning_rate": 4.0558605955951466e-05, "loss": 0.1603, "num_input_tokens_seen": 2623464, "step": 13760 }, { "epoch": 7.154365904365904, "grad_norm": 0.684436559677124, "learning_rate": 4.05497286709676e-05, "loss": 0.1528, "num_input_tokens_seen": 2624424, "step": 13765 }, { "epoch": 7.156964656964657, "grad_norm": 1.0035045146942139, "learning_rate": 4.054084818703401e-05, "loss": 0.24, "num_input_tokens_seen": 2625352, "step": 13770 }, { "epoch": 7.159563409563409, "grad_norm": 0.38899293541908264, "learning_rate": 4.0531964505977603e-05, "loss": 0.1674, "num_input_tokens_seen": 2626344, "step": 13775 }, { "epoch": 7.162162162162162, "grad_norm": 1.115031123161316, "learning_rate": 4.052307762962597e-05, "loss": 0.276, "num_input_tokens_seen": 2627368, "step": 13780 }, { "epoch": 7.164760914760914, "grad_norm": 0.2126745581626892, "learning_rate": 4.051418755980736e-05, "loss": 0.1192, "num_input_tokens_seen": 2628296, "step": 13785 }, { "epoch": 7.167359667359667, "grad_norm": 0.15102531015872955, "learning_rate": 4.050529429835066e-05, "loss": 0.0955, "num_input_tokens_seen": 2629256, "step": 13790 }, { "epoch": 7.16995841995842, "grad_norm": 0.6460509896278381, "learning_rate": 4.0496397847085435e-05, "loss": 0.1528, "num_input_tokens_seen": 2630216, "step": 13795 }, { "epoch": 7.172557172557172, "grad_norm": 0.8088229298591614, "learning_rate": 4.04874982078419e-05, "loss": 0.0981, "num_input_tokens_seen": 2631144, "step": 13800 }, { "epoch": 7.175155925155925, "grad_norm": 0.7951326370239258, "learning_rate": 4.047859538245091e-05, "loss": 0.1664, "num_input_tokens_seen": 2632136, "step": 13805 }, { "epoch": 7.1777546777546775, "grad_norm": 0.45554259419441223, "learning_rate": 4.046968937274399e-05, "loss": 0.1204, "num_input_tokens_seen": 2633032, "step": 13810 }, { "epoch": 7.18035343035343, "grad_norm": 0.4026013910770416, "learning_rate": 4.0460780180553336e-05, "loss": 0.1566, "num_input_tokens_seen": 2633960, "step": 13815 }, { "epoch": 7.182952182952183, "grad_norm": 1.3063709735870361, "learning_rate": 4.045186780771176e-05, "loss": 0.1266, "num_input_tokens_seen": 2634952, "step": 13820 }, { "epoch": 7.1855509355509355, "grad_norm": 0.4851328134536743, "learning_rate": 4.044295225605277e-05, "loss": 0.1135, "num_input_tokens_seen": 2635912, "step": 13825 }, { "epoch": 7.1881496881496885, "grad_norm": 0.384872704744339, "learning_rate": 4.043403352741049e-05, "loss": 0.0521, "num_input_tokens_seen": 2636840, "step": 13830 }, { "epoch": 7.1907484407484406, "grad_norm": 0.3603074848651886, "learning_rate": 4.042511162361971e-05, "loss": 0.1048, "num_input_tokens_seen": 2637768, "step": 13835 }, { "epoch": 7.1933471933471935, "grad_norm": 0.11755399405956268, "learning_rate": 4.04161865465159e-05, "loss": 0.1321, "num_input_tokens_seen": 2638760, "step": 13840 }, { "epoch": 7.195945945945946, "grad_norm": 1.0553312301635742, "learning_rate": 4.0407258297935146e-05, "loss": 0.138, "num_input_tokens_seen": 2639720, "step": 13845 }, { "epoch": 7.198544698544699, "grad_norm": 0.41428491473197937, "learning_rate": 4.03983268797142e-05, "loss": 0.2008, "num_input_tokens_seen": 2640680, "step": 13850 }, { "epoch": 7.201143451143452, "grad_norm": 0.6127001047134399, "learning_rate": 4.038939229369049e-05, "loss": 0.0629, "num_input_tokens_seen": 2641576, "step": 13855 }, { "epoch": 7.203742203742204, "grad_norm": 0.6938425898551941, "learning_rate": 4.0380454541702044e-05, "loss": 0.0988, "num_input_tokens_seen": 2642568, "step": 13860 }, { "epoch": 7.206340956340957, "grad_norm": 0.12209427356719971, "learning_rate": 4.037151362558759e-05, "loss": 0.0901, "num_input_tokens_seen": 2643560, "step": 13865 }, { "epoch": 7.208939708939709, "grad_norm": 1.2747498750686646, "learning_rate": 4.036256954718647e-05, "loss": 0.1482, "num_input_tokens_seen": 2644552, "step": 13870 }, { "epoch": 7.211538461538462, "grad_norm": 0.584426999092102, "learning_rate": 4.0353622308338715e-05, "loss": 0.183, "num_input_tokens_seen": 2645448, "step": 13875 }, { "epoch": 7.214137214137214, "grad_norm": 1.512607216835022, "learning_rate": 4.034467191088497e-05, "loss": 0.1366, "num_input_tokens_seen": 2646408, "step": 13880 }, { "epoch": 7.216735966735967, "grad_norm": 0.7791837453842163, "learning_rate": 4.033571835666655e-05, "loss": 0.0885, "num_input_tokens_seen": 2647336, "step": 13885 }, { "epoch": 7.21933471933472, "grad_norm": 0.10033722221851349, "learning_rate": 4.032676164752541e-05, "loss": 0.1291, "num_input_tokens_seen": 2648232, "step": 13890 }, { "epoch": 7.221933471933472, "grad_norm": 0.13751111924648285, "learning_rate": 4.031780178530417e-05, "loss": 0.1228, "num_input_tokens_seen": 2649256, "step": 13895 }, { "epoch": 7.224532224532225, "grad_norm": 0.10007987171411514, "learning_rate": 4.030883877184608e-05, "loss": 0.1381, "num_input_tokens_seen": 2650216, "step": 13900 }, { "epoch": 7.227130977130977, "grad_norm": 1.2798049449920654, "learning_rate": 4.029987260899504e-05, "loss": 0.1194, "num_input_tokens_seen": 2651144, "step": 13905 }, { "epoch": 7.22972972972973, "grad_norm": 0.7322436571121216, "learning_rate": 4.029090329859562e-05, "loss": 0.0462, "num_input_tokens_seen": 2652040, "step": 13910 }, { "epoch": 7.232328482328482, "grad_norm": 1.050417184829712, "learning_rate": 4.028193084249299e-05, "loss": 0.2236, "num_input_tokens_seen": 2652968, "step": 13915 }, { "epoch": 7.234927234927235, "grad_norm": 0.6474205851554871, "learning_rate": 4.027295524253303e-05, "loss": 0.1555, "num_input_tokens_seen": 2653992, "step": 13920 }, { "epoch": 7.237525987525988, "grad_norm": 0.6027494668960571, "learning_rate": 4.0263976500562225e-05, "loss": 0.2456, "num_input_tokens_seen": 2655080, "step": 13925 }, { "epoch": 7.24012474012474, "grad_norm": 0.43993058800697327, "learning_rate": 4.025499461842771e-05, "loss": 0.1187, "num_input_tokens_seen": 2656040, "step": 13930 }, { "epoch": 7.242723492723493, "grad_norm": 0.4784090518951416, "learning_rate": 4.024600959797728e-05, "loss": 0.0693, "num_input_tokens_seen": 2657000, "step": 13935 }, { "epoch": 7.245322245322245, "grad_norm": 0.17847950756549835, "learning_rate": 4.023702144105936e-05, "loss": 0.1365, "num_input_tokens_seen": 2657928, "step": 13940 }, { "epoch": 7.247920997920998, "grad_norm": 0.19100822508335114, "learning_rate": 4.022803014952303e-05, "loss": 0.1152, "num_input_tokens_seen": 2658824, "step": 13945 }, { "epoch": 7.25051975051975, "grad_norm": 0.25742512941360474, "learning_rate": 4.021903572521802e-05, "loss": 0.1306, "num_input_tokens_seen": 2659784, "step": 13950 }, { "epoch": 7.253118503118503, "grad_norm": 1.9855893850326538, "learning_rate": 4.021003816999469e-05, "loss": 0.1887, "num_input_tokens_seen": 2660680, "step": 13955 }, { "epoch": 7.255717255717256, "grad_norm": 0.6345324516296387, "learning_rate": 4.020103748570405e-05, "loss": 0.1676, "num_input_tokens_seen": 2661640, "step": 13960 }, { "epoch": 7.258316008316008, "grad_norm": 0.36714568734169006, "learning_rate": 4.019203367419776e-05, "loss": 0.1271, "num_input_tokens_seen": 2662536, "step": 13965 }, { "epoch": 7.260914760914761, "grad_norm": 0.5218799114227295, "learning_rate": 4.018302673732812e-05, "loss": 0.1201, "num_input_tokens_seen": 2663560, "step": 13970 }, { "epoch": 7.263513513513513, "grad_norm": 0.2653793394565582, "learning_rate": 4.017401667694807e-05, "loss": 0.0666, "num_input_tokens_seen": 2664552, "step": 13975 }, { "epoch": 7.266112266112266, "grad_norm": 0.3611149191856384, "learning_rate": 4.0165003494911194e-05, "loss": 0.1179, "num_input_tokens_seen": 2665544, "step": 13980 }, { "epoch": 7.268711018711019, "grad_norm": 0.44437873363494873, "learning_rate": 4.015598719307171e-05, "loss": 0.1241, "num_input_tokens_seen": 2666440, "step": 13985 }, { "epoch": 7.271309771309771, "grad_norm": 0.6040059924125671, "learning_rate": 4.01469677732845e-05, "loss": 0.0824, "num_input_tokens_seen": 2667336, "step": 13990 }, { "epoch": 7.273908523908524, "grad_norm": 0.4461532533168793, "learning_rate": 4.013794523740507e-05, "loss": 0.1072, "num_input_tokens_seen": 2668296, "step": 13995 }, { "epoch": 7.276507276507276, "grad_norm": 0.6360507607460022, "learning_rate": 4.012891958728956e-05, "loss": 0.0973, "num_input_tokens_seen": 2669224, "step": 14000 }, { "epoch": 7.279106029106029, "grad_norm": 0.5005232095718384, "learning_rate": 4.011989082479477e-05, "loss": 0.155, "num_input_tokens_seen": 2670216, "step": 14005 }, { "epoch": 7.281704781704781, "grad_norm": 0.9427928924560547, "learning_rate": 4.011085895177814e-05, "loss": 0.183, "num_input_tokens_seen": 2671144, "step": 14010 }, { "epoch": 7.284303534303534, "grad_norm": 0.6313673257827759, "learning_rate": 4.0101823970097726e-05, "loss": 0.2329, "num_input_tokens_seen": 2672104, "step": 14015 }, { "epoch": 7.286902286902287, "grad_norm": 1.2851548194885254, "learning_rate": 4.0092785881612236e-05, "loss": 0.184, "num_input_tokens_seen": 2673000, "step": 14020 }, { "epoch": 7.289501039501039, "grad_norm": 0.12063562124967575, "learning_rate": 4.008374468818104e-05, "loss": 0.061, "num_input_tokens_seen": 2673960, "step": 14025 }, { "epoch": 7.292099792099792, "grad_norm": 0.22903268039226532, "learning_rate": 4.007470039166412e-05, "loss": 0.1997, "num_input_tokens_seen": 2674920, "step": 14030 }, { "epoch": 7.294698544698544, "grad_norm": 0.5972993969917297, "learning_rate": 4.00656529939221e-05, "loss": 0.1431, "num_input_tokens_seen": 2675816, "step": 14035 }, { "epoch": 7.297297297297297, "grad_norm": 1.4025706052780151, "learning_rate": 4.0056602496816236e-05, "loss": 0.2459, "num_input_tokens_seen": 2676808, "step": 14040 }, { "epoch": 7.29989604989605, "grad_norm": 0.8575888276100159, "learning_rate": 4.004754890220844e-05, "loss": 0.2238, "num_input_tokens_seen": 2677768, "step": 14045 }, { "epoch": 7.302494802494802, "grad_norm": 0.41510477662086487, "learning_rate": 4.003849221196126e-05, "loss": 0.1705, "num_input_tokens_seen": 2678728, "step": 14050 }, { "epoch": 7.305093555093555, "grad_norm": 1.1277623176574707, "learning_rate": 4.0029432427937864e-05, "loss": 0.2562, "num_input_tokens_seen": 2679720, "step": 14055 }, { "epoch": 7.3076923076923075, "grad_norm": 0.6500293016433716, "learning_rate": 4.0020369552002064e-05, "loss": 0.0948, "num_input_tokens_seen": 2680552, "step": 14060 }, { "epoch": 7.3102910602910605, "grad_norm": 0.7973259687423706, "learning_rate": 4.001130358601831e-05, "loss": 0.1663, "num_input_tokens_seen": 2681480, "step": 14065 }, { "epoch": 7.3128898128898125, "grad_norm": 0.12230261415243149, "learning_rate": 4.000223453185169e-05, "loss": 0.0542, "num_input_tokens_seen": 2682504, "step": 14070 }, { "epoch": 7.3154885654885655, "grad_norm": 0.48174548149108887, "learning_rate": 3.9993162391367926e-05, "loss": 0.1046, "num_input_tokens_seen": 2683464, "step": 14075 }, { "epoch": 7.3180873180873185, "grad_norm": 0.2840946614742279, "learning_rate": 3.998408716643337e-05, "loss": 0.0781, "num_input_tokens_seen": 2684424, "step": 14080 }, { "epoch": 7.320686070686071, "grad_norm": 1.699417233467102, "learning_rate": 3.997500885891502e-05, "loss": 0.2139, "num_input_tokens_seen": 2685384, "step": 14085 }, { "epoch": 7.3232848232848236, "grad_norm": 0.31873708963394165, "learning_rate": 3.996592747068049e-05, "loss": 0.1684, "num_input_tokens_seen": 2686280, "step": 14090 }, { "epoch": 7.325883575883576, "grad_norm": 1.347965121269226, "learning_rate": 3.995684300359804e-05, "loss": 0.0973, "num_input_tokens_seen": 2687272, "step": 14095 }, { "epoch": 7.328482328482329, "grad_norm": 0.8139423727989197, "learning_rate": 3.9947755459536564e-05, "loss": 0.1698, "num_input_tokens_seen": 2688168, "step": 14100 }, { "epoch": 7.331081081081081, "grad_norm": 0.5140500664710999, "learning_rate": 3.993866484036558e-05, "loss": 0.1127, "num_input_tokens_seen": 2689128, "step": 14105 }, { "epoch": 7.333679833679834, "grad_norm": 1.4950670003890991, "learning_rate": 3.9929571147955255e-05, "loss": 0.2261, "num_input_tokens_seen": 2690120, "step": 14110 }, { "epoch": 7.336278586278587, "grad_norm": 0.39222052693367004, "learning_rate": 3.9920474384176375e-05, "loss": 0.1346, "num_input_tokens_seen": 2691080, "step": 14115 }, { "epoch": 7.338877338877339, "grad_norm": 1.090261459350586, "learning_rate": 3.991137455090036e-05, "loss": 0.1945, "num_input_tokens_seen": 2692040, "step": 14120 }, { "epoch": 7.341476091476092, "grad_norm": 0.6402333974838257, "learning_rate": 3.990227164999926e-05, "loss": 0.1625, "num_input_tokens_seen": 2693000, "step": 14125 }, { "epoch": 7.344074844074844, "grad_norm": 1.0652495622634888, "learning_rate": 3.9893165683345765e-05, "loss": 0.091, "num_input_tokens_seen": 2694024, "step": 14130 }, { "epoch": 7.346673596673597, "grad_norm": 0.46481403708457947, "learning_rate": 3.9884056652813184e-05, "loss": 0.1547, "num_input_tokens_seen": 2695016, "step": 14135 }, { "epoch": 7.349272349272349, "grad_norm": 0.4886239469051361, "learning_rate": 3.987494456027546e-05, "loss": 0.0574, "num_input_tokens_seen": 2695912, "step": 14140 }, { "epoch": 7.351871101871102, "grad_norm": 0.6250771880149841, "learning_rate": 3.986582940760717e-05, "loss": 0.1099, "num_input_tokens_seen": 2696872, "step": 14145 }, { "epoch": 7.354469854469855, "grad_norm": 1.1760214567184448, "learning_rate": 3.9856711196683514e-05, "loss": 0.2102, "num_input_tokens_seen": 2697864, "step": 14150 }, { "epoch": 7.357068607068607, "grad_norm": 0.9937198758125305, "learning_rate": 3.984758992938034e-05, "loss": 0.0717, "num_input_tokens_seen": 2698792, "step": 14155 }, { "epoch": 7.35966735966736, "grad_norm": 0.16129715740680695, "learning_rate": 3.9838465607574096e-05, "loss": 0.1095, "num_input_tokens_seen": 2699784, "step": 14160 }, { "epoch": 7.362266112266112, "grad_norm": 0.3939419984817505, "learning_rate": 3.982933823314188e-05, "loss": 0.0452, "num_input_tokens_seen": 2700712, "step": 14165 }, { "epoch": 7.364864864864865, "grad_norm": 0.6911078095436096, "learning_rate": 3.98202078079614e-05, "loss": 0.1747, "num_input_tokens_seen": 2701672, "step": 14170 }, { "epoch": 7.367463617463617, "grad_norm": 0.5187019109725952, "learning_rate": 3.981107433391101e-05, "loss": 0.2182, "num_input_tokens_seen": 2702664, "step": 14175 }, { "epoch": 7.37006237006237, "grad_norm": 0.8758574724197388, "learning_rate": 3.980193781286968e-05, "loss": 0.1735, "num_input_tokens_seen": 2703560, "step": 14180 }, { "epoch": 7.372661122661123, "grad_norm": 0.6846583485603333, "learning_rate": 3.9792798246717025e-05, "loss": 0.1223, "num_input_tokens_seen": 2704584, "step": 14185 }, { "epoch": 7.375259875259875, "grad_norm": 0.16191846132278442, "learning_rate": 3.9783655637333245e-05, "loss": 0.1068, "num_input_tokens_seen": 2705544, "step": 14190 }, { "epoch": 7.377858627858628, "grad_norm": 0.87546706199646, "learning_rate": 3.9774509986599204e-05, "loss": 0.3039, "num_input_tokens_seen": 2706536, "step": 14195 }, { "epoch": 7.38045738045738, "grad_norm": 0.46854621171951294, "learning_rate": 3.9765361296396395e-05, "loss": 0.0837, "num_input_tokens_seen": 2707496, "step": 14200 }, { "epoch": 7.383056133056133, "grad_norm": 0.8467053771018982, "learning_rate": 3.9756209568606904e-05, "loss": 0.1283, "num_input_tokens_seen": 2708392, "step": 14205 }, { "epoch": 7.385654885654886, "grad_norm": 0.13205452263355255, "learning_rate": 3.974705480511346e-05, "loss": 0.167, "num_input_tokens_seen": 2709256, "step": 14210 }, { "epoch": 7.388253638253638, "grad_norm": 0.3882896304130554, "learning_rate": 3.973789700779943e-05, "loss": 0.1844, "num_input_tokens_seen": 2710184, "step": 14215 }, { "epoch": 7.390852390852391, "grad_norm": 0.49248772859573364, "learning_rate": 3.9728736178548775e-05, "loss": 0.0933, "num_input_tokens_seen": 2711144, "step": 14220 }, { "epoch": 7.393451143451143, "grad_norm": 0.5512486696243286, "learning_rate": 3.97195723192461e-05, "loss": 0.1018, "num_input_tokens_seen": 2712104, "step": 14225 }, { "epoch": 7.396049896049896, "grad_norm": 0.836677610874176, "learning_rate": 3.971040543177664e-05, "loss": 0.1598, "num_input_tokens_seen": 2713064, "step": 14230 }, { "epoch": 7.398648648648648, "grad_norm": 0.09249307215213776, "learning_rate": 3.970123551802623e-05, "loss": 0.0466, "num_input_tokens_seen": 2713896, "step": 14235 }, { "epoch": 7.401247401247401, "grad_norm": 1.8358557224273682, "learning_rate": 3.9692062579881346e-05, "loss": 0.1686, "num_input_tokens_seen": 2714824, "step": 14240 }, { "epoch": 7.403846153846154, "grad_norm": 0.27240803837776184, "learning_rate": 3.9682886619229075e-05, "loss": 0.1489, "num_input_tokens_seen": 2715752, "step": 14245 }, { "epoch": 7.406444906444906, "grad_norm": 2.334662437438965, "learning_rate": 3.967370763795714e-05, "loss": 0.3603, "num_input_tokens_seen": 2716648, "step": 14250 }, { "epoch": 7.409043659043659, "grad_norm": 0.40300458669662476, "learning_rate": 3.966452563795386e-05, "loss": 0.1556, "num_input_tokens_seen": 2717672, "step": 14255 }, { "epoch": 7.411642411642411, "grad_norm": 0.266002893447876, "learning_rate": 3.965534062110821e-05, "loss": 0.0329, "num_input_tokens_seen": 2718632, "step": 14260 }, { "epoch": 7.414241164241164, "grad_norm": 0.6197682023048401, "learning_rate": 3.964615258930975e-05, "loss": 0.1555, "num_input_tokens_seen": 2719624, "step": 14265 }, { "epoch": 7.416839916839917, "grad_norm": 0.2630367875099182, "learning_rate": 3.963696154444869e-05, "loss": 0.1618, "num_input_tokens_seen": 2720520, "step": 14270 }, { "epoch": 7.419438669438669, "grad_norm": 0.4903815686702728, "learning_rate": 3.962776748841584e-05, "loss": 0.1834, "num_input_tokens_seen": 2721448, "step": 14275 }, { "epoch": 7.422037422037422, "grad_norm": 0.7558334469795227, "learning_rate": 3.961857042310264e-05, "loss": 0.3308, "num_input_tokens_seen": 2722312, "step": 14280 }, { "epoch": 7.424636174636174, "grad_norm": 0.1648963838815689, "learning_rate": 3.960937035040115e-05, "loss": 0.1433, "num_input_tokens_seen": 2723240, "step": 14285 }, { "epoch": 7.427234927234927, "grad_norm": 0.6685584187507629, "learning_rate": 3.960016727220402e-05, "loss": 0.1111, "num_input_tokens_seen": 2724200, "step": 14290 }, { "epoch": 7.4298336798336795, "grad_norm": 0.5550060272216797, "learning_rate": 3.959096119040458e-05, "loss": 0.0699, "num_input_tokens_seen": 2725096, "step": 14295 }, { "epoch": 7.4324324324324325, "grad_norm": 1.025785207748413, "learning_rate": 3.958175210689671e-05, "loss": 0.1619, "num_input_tokens_seen": 2726056, "step": 14300 }, { "epoch": 7.435031185031185, "grad_norm": 0.10899965465068817, "learning_rate": 3.957254002357496e-05, "loss": 0.0862, "num_input_tokens_seen": 2726984, "step": 14305 }, { "epoch": 7.4376299376299375, "grad_norm": 0.9786275029182434, "learning_rate": 3.9563324942334454e-05, "loss": 0.1923, "num_input_tokens_seen": 2727880, "step": 14310 }, { "epoch": 7.4402286902286905, "grad_norm": 0.21233788132667542, "learning_rate": 3.955410686507096e-05, "loss": 0.0652, "num_input_tokens_seen": 2728872, "step": 14315 }, { "epoch": 7.442827442827443, "grad_norm": 0.6750558614730835, "learning_rate": 3.954488579368087e-05, "loss": 0.0528, "num_input_tokens_seen": 2729768, "step": 14320 }, { "epoch": 7.4454261954261955, "grad_norm": 0.8694570064544678, "learning_rate": 3.9535661730061155e-05, "loss": 0.1546, "num_input_tokens_seen": 2730760, "step": 14325 }, { "epoch": 7.448024948024948, "grad_norm": 0.730588436126709, "learning_rate": 3.952643467610945e-05, "loss": 0.1278, "num_input_tokens_seen": 2731752, "step": 14330 }, { "epoch": 7.450623700623701, "grad_norm": 0.2819216549396515, "learning_rate": 3.951720463372396e-05, "loss": 0.1517, "num_input_tokens_seen": 2732776, "step": 14335 }, { "epoch": 7.453222453222454, "grad_norm": 0.7548549175262451, "learning_rate": 3.950797160480354e-05, "loss": 0.1214, "num_input_tokens_seen": 2733704, "step": 14340 }, { "epoch": 7.455821205821206, "grad_norm": 0.41035687923431396, "learning_rate": 3.949873559124763e-05, "loss": 0.0694, "num_input_tokens_seen": 2734568, "step": 14345 }, { "epoch": 7.458419958419959, "grad_norm": 1.2070413827896118, "learning_rate": 3.948949659495631e-05, "loss": 0.2159, "num_input_tokens_seen": 2735528, "step": 14350 }, { "epoch": 7.461018711018711, "grad_norm": 1.6684781312942505, "learning_rate": 3.948025461783025e-05, "loss": 0.1533, "num_input_tokens_seen": 2736456, "step": 14355 }, { "epoch": 7.463617463617464, "grad_norm": 0.6942318081855774, "learning_rate": 3.9471009661770755e-05, "loss": 0.1236, "num_input_tokens_seen": 2737416, "step": 14360 }, { "epoch": 7.466216216216216, "grad_norm": 0.7847647666931152, "learning_rate": 3.946176172867973e-05, "loss": 0.0562, "num_input_tokens_seen": 2738344, "step": 14365 }, { "epoch": 7.468814968814969, "grad_norm": 0.6081158518791199, "learning_rate": 3.945251082045969e-05, "loss": 0.1256, "num_input_tokens_seen": 2739304, "step": 14370 }, { "epoch": 7.471413721413722, "grad_norm": 0.6803690791130066, "learning_rate": 3.9443256939013775e-05, "loss": 0.1855, "num_input_tokens_seen": 2740232, "step": 14375 }, { "epoch": 7.474012474012474, "grad_norm": 0.17755047976970673, "learning_rate": 3.9434000086245724e-05, "loss": 0.0967, "num_input_tokens_seen": 2741128, "step": 14380 }, { "epoch": 7.476611226611227, "grad_norm": 1.1929874420166016, "learning_rate": 3.9424740264059904e-05, "loss": 0.2165, "num_input_tokens_seen": 2742056, "step": 14385 }, { "epoch": 7.479209979209979, "grad_norm": 1.0322521924972534, "learning_rate": 3.941547747436125e-05, "loss": 0.179, "num_input_tokens_seen": 2743048, "step": 14390 }, { "epoch": 7.481808731808732, "grad_norm": 0.5417822599411011, "learning_rate": 3.9406211719055375e-05, "loss": 0.1418, "num_input_tokens_seen": 2743944, "step": 14395 }, { "epoch": 7.484407484407485, "grad_norm": 1.0505324602127075, "learning_rate": 3.9396943000048437e-05, "loss": 0.1624, "num_input_tokens_seen": 2744936, "step": 14400 }, { "epoch": 7.487006237006237, "grad_norm": 0.16848066449165344, "learning_rate": 3.9387671319247256e-05, "loss": 0.0687, "num_input_tokens_seen": 2745896, "step": 14405 }, { "epoch": 7.48960498960499, "grad_norm": 0.20414428412914276, "learning_rate": 3.937839667855922e-05, "loss": 0.1066, "num_input_tokens_seen": 2746888, "step": 14410 }, { "epoch": 7.492203742203742, "grad_norm": 1.1424108743667603, "learning_rate": 3.936911907989236e-05, "loss": 0.2948, "num_input_tokens_seen": 2747848, "step": 14415 }, { "epoch": 7.494802494802495, "grad_norm": 0.31161022186279297, "learning_rate": 3.935983852515528e-05, "loss": 0.1033, "num_input_tokens_seen": 2748744, "step": 14420 }, { "epoch": 7.497401247401247, "grad_norm": 0.16077233850955963, "learning_rate": 3.935055501625722e-05, "loss": 0.0852, "num_input_tokens_seen": 2749704, "step": 14425 }, { "epoch": 7.5, "grad_norm": 0.2109048068523407, "learning_rate": 3.9341268555108024e-05, "loss": 0.0681, "num_input_tokens_seen": 2750664, "step": 14430 }, { "epoch": 7.502598752598753, "grad_norm": 0.12004071474075317, "learning_rate": 3.9331979143618134e-05, "loss": 0.2095, "num_input_tokens_seen": 2751624, "step": 14435 }, { "epoch": 7.505197505197505, "grad_norm": 0.3273334205150604, "learning_rate": 3.9322686783698604e-05, "loss": 0.2544, "num_input_tokens_seen": 2752616, "step": 14440 }, { "epoch": 7.507796257796258, "grad_norm": 0.8388659358024597, "learning_rate": 3.9313391477261084e-05, "loss": 0.1909, "num_input_tokens_seen": 2753576, "step": 14445 }, { "epoch": 7.51039501039501, "grad_norm": 0.21220828592777252, "learning_rate": 3.9304093226217865e-05, "loss": 0.2658, "num_input_tokens_seen": 2754504, "step": 14450 }, { "epoch": 7.512993762993763, "grad_norm": 0.1845872402191162, "learning_rate": 3.9294792032481796e-05, "loss": 0.1628, "num_input_tokens_seen": 2755496, "step": 14455 }, { "epoch": 7.515592515592516, "grad_norm": 0.4771318733692169, "learning_rate": 3.928548789796636e-05, "loss": 0.1155, "num_input_tokens_seen": 2756552, "step": 14460 }, { "epoch": 7.518191268191268, "grad_norm": 0.3136962950229645, "learning_rate": 3.927618082458564e-05, "loss": 0.0439, "num_input_tokens_seen": 2757416, "step": 14465 }, { "epoch": 7.520790020790021, "grad_norm": 0.07635185867547989, "learning_rate": 3.926687081425433e-05, "loss": 0.1458, "num_input_tokens_seen": 2758376, "step": 14470 }, { "epoch": 7.523388773388773, "grad_norm": 0.8326581716537476, "learning_rate": 3.925755786888771e-05, "loss": 0.1294, "num_input_tokens_seen": 2759272, "step": 14475 }, { "epoch": 7.525987525987526, "grad_norm": 0.38803017139434814, "learning_rate": 3.9248241990401676e-05, "loss": 0.0568, "num_input_tokens_seen": 2760168, "step": 14480 }, { "epoch": 7.528586278586278, "grad_norm": 0.20416943728923798, "learning_rate": 3.923892318071274e-05, "loss": 0.1152, "num_input_tokens_seen": 2761096, "step": 14485 }, { "epoch": 7.531185031185031, "grad_norm": 0.09142003208398819, "learning_rate": 3.9229601441737986e-05, "loss": 0.1159, "num_input_tokens_seen": 2762024, "step": 14490 }, { "epoch": 7.533783783783784, "grad_norm": 0.9830790758132935, "learning_rate": 3.922027677539512e-05, "loss": 0.1717, "num_input_tokens_seen": 2762952, "step": 14495 }, { "epoch": 7.536382536382536, "grad_norm": 0.09436488151550293, "learning_rate": 3.921094918360245e-05, "loss": 0.0693, "num_input_tokens_seen": 2763848, "step": 14500 }, { "epoch": 7.538981288981289, "grad_norm": 0.7896081805229187, "learning_rate": 3.920161866827889e-05, "loss": 0.173, "num_input_tokens_seen": 2764872, "step": 14505 }, { "epoch": 7.541580041580041, "grad_norm": 0.29309940338134766, "learning_rate": 3.919228523134395e-05, "loss": 0.1627, "num_input_tokens_seen": 2765864, "step": 14510 }, { "epoch": 7.544178794178794, "grad_norm": 0.5416246056556702, "learning_rate": 3.9182948874717736e-05, "loss": 0.1239, "num_input_tokens_seen": 2766792, "step": 14515 }, { "epoch": 7.546777546777546, "grad_norm": 0.148263081908226, "learning_rate": 3.9173609600320966e-05, "loss": 0.1712, "num_input_tokens_seen": 2767784, "step": 14520 }, { "epoch": 7.549376299376299, "grad_norm": 0.24744988977909088, "learning_rate": 3.916426741007493e-05, "loss": 0.0618, "num_input_tokens_seen": 2768712, "step": 14525 }, { "epoch": 7.551975051975052, "grad_norm": 0.47417765855789185, "learning_rate": 3.9154922305901565e-05, "loss": 0.0738, "num_input_tokens_seen": 2769608, "step": 14530 }, { "epoch": 7.5545738045738045, "grad_norm": 0.20166660845279694, "learning_rate": 3.9145574289723366e-05, "loss": 0.1206, "num_input_tokens_seen": 2770504, "step": 14535 }, { "epoch": 7.557172557172557, "grad_norm": 0.18377622961997986, "learning_rate": 3.913622336346344e-05, "loss": 0.192, "num_input_tokens_seen": 2771432, "step": 14540 }, { "epoch": 7.5597713097713095, "grad_norm": 0.3889837861061096, "learning_rate": 3.912686952904552e-05, "loss": 0.14, "num_input_tokens_seen": 2772360, "step": 14545 }, { "epoch": 7.5623700623700625, "grad_norm": 1.4498313665390015, "learning_rate": 3.911751278839388e-05, "loss": 0.1688, "num_input_tokens_seen": 2773320, "step": 14550 }, { "epoch": 7.564968814968815, "grad_norm": 2.0110671520233154, "learning_rate": 3.9108153143433446e-05, "loss": 0.2285, "num_input_tokens_seen": 2774248, "step": 14555 }, { "epoch": 7.5675675675675675, "grad_norm": 0.331320196390152, "learning_rate": 3.9098790596089704e-05, "loss": 0.1752, "num_input_tokens_seen": 2775272, "step": 14560 }, { "epoch": 7.5701663201663205, "grad_norm": 2.0045979022979736, "learning_rate": 3.908942514828877e-05, "loss": 0.2626, "num_input_tokens_seen": 2776200, "step": 14565 }, { "epoch": 7.572765072765073, "grad_norm": 0.28631022572517395, "learning_rate": 3.9080056801957335e-05, "loss": 0.159, "num_input_tokens_seen": 2777160, "step": 14570 }, { "epoch": 7.575363825363826, "grad_norm": 0.20993703603744507, "learning_rate": 3.907068555902268e-05, "loss": 0.0955, "num_input_tokens_seen": 2778152, "step": 14575 }, { "epoch": 7.577962577962578, "grad_norm": 0.13512416183948517, "learning_rate": 3.906131142141271e-05, "loss": 0.1255, "num_input_tokens_seen": 2779048, "step": 14580 }, { "epoch": 7.580561330561331, "grad_norm": 0.5395327806472778, "learning_rate": 3.90519343910559e-05, "loss": 0.0886, "num_input_tokens_seen": 2779976, "step": 14585 }, { "epoch": 7.583160083160083, "grad_norm": 0.5691362619400024, "learning_rate": 3.904255446988132e-05, "loss": 0.1349, "num_input_tokens_seen": 2781000, "step": 14590 }, { "epoch": 7.585758835758836, "grad_norm": 0.5109725594520569, "learning_rate": 3.9033171659818654e-05, "loss": 0.1222, "num_input_tokens_seen": 2782024, "step": 14595 }, { "epoch": 7.588357588357589, "grad_norm": 0.6926700472831726, "learning_rate": 3.9023785962798176e-05, "loss": 0.0998, "num_input_tokens_seen": 2782984, "step": 14600 }, { "epoch": 7.590956340956341, "grad_norm": 0.27339354157447815, "learning_rate": 3.901439738075073e-05, "loss": 0.1458, "num_input_tokens_seen": 2783944, "step": 14605 }, { "epoch": 7.593555093555094, "grad_norm": 0.5759967565536499, "learning_rate": 3.900500591560779e-05, "loss": 0.0812, "num_input_tokens_seen": 2784872, "step": 14610 }, { "epoch": 7.596153846153846, "grad_norm": 1.024625301361084, "learning_rate": 3.899561156930139e-05, "loss": 0.2704, "num_input_tokens_seen": 2785800, "step": 14615 }, { "epoch": 7.598752598752599, "grad_norm": 1.018791913986206, "learning_rate": 3.898621434376417e-05, "loss": 0.2894, "num_input_tokens_seen": 2786728, "step": 14620 }, { "epoch": 7.601351351351351, "grad_norm": 0.6283175945281982, "learning_rate": 3.897681424092937e-05, "loss": 0.1261, "num_input_tokens_seen": 2787656, "step": 14625 }, { "epoch": 7.603950103950104, "grad_norm": 0.5077320337295532, "learning_rate": 3.8967411262730815e-05, "loss": 0.1576, "num_input_tokens_seen": 2788584, "step": 14630 }, { "epoch": 7.606548856548857, "grad_norm": 0.2357829064130783, "learning_rate": 3.8958005411102924e-05, "loss": 0.1044, "num_input_tokens_seen": 2789608, "step": 14635 }, { "epoch": 7.609147609147609, "grad_norm": 0.09122277051210403, "learning_rate": 3.8948596687980696e-05, "loss": 0.1502, "num_input_tokens_seen": 2790664, "step": 14640 }, { "epoch": 7.611746361746362, "grad_norm": 0.7311398983001709, "learning_rate": 3.893918509529974e-05, "loss": 0.2215, "num_input_tokens_seen": 2791592, "step": 14645 }, { "epoch": 7.614345114345114, "grad_norm": 0.8254291415214539, "learning_rate": 3.892977063499623e-05, "loss": 0.1167, "num_input_tokens_seen": 2792488, "step": 14650 }, { "epoch": 7.616943866943867, "grad_norm": 1.8634679317474365, "learning_rate": 3.892035330900696e-05, "loss": 0.2023, "num_input_tokens_seen": 2793480, "step": 14655 }, { "epoch": 7.61954261954262, "grad_norm": 0.32210099697113037, "learning_rate": 3.89109331192693e-05, "loss": 0.1073, "num_input_tokens_seen": 2794408, "step": 14660 }, { "epoch": 7.622141372141372, "grad_norm": 0.3863310217857361, "learning_rate": 3.890151006772119e-05, "loss": 0.0823, "num_input_tokens_seen": 2795336, "step": 14665 }, { "epoch": 7.624740124740125, "grad_norm": 1.046456217765808, "learning_rate": 3.889208415630119e-05, "loss": 0.1453, "num_input_tokens_seen": 2796360, "step": 14670 }, { "epoch": 7.627338877338877, "grad_norm": 0.8136219382286072, "learning_rate": 3.888265538694844e-05, "loss": 0.1647, "num_input_tokens_seen": 2797256, "step": 14675 }, { "epoch": 7.62993762993763, "grad_norm": 0.8051929473876953, "learning_rate": 3.887322376160264e-05, "loss": 0.1416, "num_input_tokens_seen": 2798184, "step": 14680 }, { "epoch": 7.632536382536383, "grad_norm": 0.10510598868131638, "learning_rate": 3.886378928220412e-05, "loss": 0.1724, "num_input_tokens_seen": 2799144, "step": 14685 }, { "epoch": 7.635135135135135, "grad_norm": 1.1446564197540283, "learning_rate": 3.8854351950693765e-05, "loss": 0.1596, "num_input_tokens_seen": 2800136, "step": 14690 }, { "epoch": 7.637733887733888, "grad_norm": 0.2970135509967804, "learning_rate": 3.884491176901307e-05, "loss": 0.0816, "num_input_tokens_seen": 2801160, "step": 14695 }, { "epoch": 7.64033264033264, "grad_norm": 0.6468262076377869, "learning_rate": 3.8835468739104094e-05, "loss": 0.117, "num_input_tokens_seen": 2802088, "step": 14700 }, { "epoch": 7.642931392931393, "grad_norm": 2.1641812324523926, "learning_rate": 3.882602286290951e-05, "loss": 0.2389, "num_input_tokens_seen": 2803112, "step": 14705 }, { "epoch": 7.645530145530145, "grad_norm": 0.503109335899353, "learning_rate": 3.881657414237255e-05, "loss": 0.0718, "num_input_tokens_seen": 2804040, "step": 14710 }, { "epoch": 7.648128898128898, "grad_norm": 0.3435494899749756, "learning_rate": 3.880712257943703e-05, "loss": 0.2453, "num_input_tokens_seen": 2805032, "step": 14715 }, { "epoch": 7.650727650727651, "grad_norm": 0.6325545907020569, "learning_rate": 3.879766817604738e-05, "loss": 0.0881, "num_input_tokens_seen": 2805896, "step": 14720 }, { "epoch": 7.653326403326403, "grad_norm": 0.5061954259872437, "learning_rate": 3.878821093414858e-05, "loss": 0.2689, "num_input_tokens_seen": 2806824, "step": 14725 }, { "epoch": 7.655925155925156, "grad_norm": 0.15723249316215515, "learning_rate": 3.877875085568623e-05, "loss": 0.0944, "num_input_tokens_seen": 2807784, "step": 14730 }, { "epoch": 7.658523908523908, "grad_norm": 0.1296951174736023, "learning_rate": 3.8769287942606466e-05, "loss": 0.0911, "num_input_tokens_seen": 2808712, "step": 14735 }, { "epoch": 7.661122661122661, "grad_norm": 0.18896172940731049, "learning_rate": 3.875982219685607e-05, "loss": 0.1045, "num_input_tokens_seen": 2809672, "step": 14740 }, { "epoch": 7.663721413721413, "grad_norm": 0.5706918239593506, "learning_rate": 3.875035362038234e-05, "loss": 0.1366, "num_input_tokens_seen": 2810696, "step": 14745 }, { "epoch": 7.666320166320166, "grad_norm": 0.9652164578437805, "learning_rate": 3.87408822151332e-05, "loss": 0.0836, "num_input_tokens_seen": 2811624, "step": 14750 }, { "epoch": 7.668918918918919, "grad_norm": 0.33891215920448303, "learning_rate": 3.873140798305715e-05, "loss": 0.1385, "num_input_tokens_seen": 2812552, "step": 14755 }, { "epoch": 7.671517671517671, "grad_norm": 2.1285147666931152, "learning_rate": 3.8721930926103244e-05, "loss": 0.2478, "num_input_tokens_seen": 2813480, "step": 14760 }, { "epoch": 7.674116424116424, "grad_norm": 0.09786566346883774, "learning_rate": 3.871245104622116e-05, "loss": 0.0866, "num_input_tokens_seen": 2814440, "step": 14765 }, { "epoch": 7.6767151767151764, "grad_norm": 0.4573808014392853, "learning_rate": 3.8702968345361134e-05, "loss": 0.078, "num_input_tokens_seen": 2815336, "step": 14770 }, { "epoch": 7.679313929313929, "grad_norm": 0.2135246843099594, "learning_rate": 3.869348282547398e-05, "loss": 0.0953, "num_input_tokens_seen": 2816264, "step": 14775 }, { "epoch": 7.6819126819126815, "grad_norm": 0.7658607959747314, "learning_rate": 3.868399448851109e-05, "loss": 0.0958, "num_input_tokens_seen": 2817224, "step": 14780 }, { "epoch": 7.6845114345114345, "grad_norm": 0.9875488877296448, "learning_rate": 3.8674503336424436e-05, "loss": 0.1061, "num_input_tokens_seen": 2818152, "step": 14785 }, { "epoch": 7.6871101871101875, "grad_norm": 0.19878320395946503, "learning_rate": 3.866500937116659e-05, "loss": 0.1121, "num_input_tokens_seen": 2819048, "step": 14790 }, { "epoch": 7.6897089397089395, "grad_norm": 1.2392350435256958, "learning_rate": 3.865551259469069e-05, "loss": 0.2098, "num_input_tokens_seen": 2820040, "step": 14795 }, { "epoch": 7.6923076923076925, "grad_norm": 0.5260343551635742, "learning_rate": 3.864601300895042e-05, "loss": 0.0829, "num_input_tokens_seen": 2820968, "step": 14800 }, { "epoch": 7.694906444906445, "grad_norm": 0.14573518931865692, "learning_rate": 3.86365106159001e-05, "loss": 0.2098, "num_input_tokens_seen": 2821896, "step": 14805 }, { "epoch": 7.697505197505198, "grad_norm": 0.3590077757835388, "learning_rate": 3.8627005417494586e-05, "loss": 0.1094, "num_input_tokens_seen": 2822856, "step": 14810 }, { "epoch": 7.70010395010395, "grad_norm": 0.8671327233314514, "learning_rate": 3.861749741568933e-05, "loss": 0.1636, "num_input_tokens_seen": 2823944, "step": 14815 }, { "epoch": 7.702702702702703, "grad_norm": 0.9131045341491699, "learning_rate": 3.860798661244035e-05, "loss": 0.127, "num_input_tokens_seen": 2824936, "step": 14820 }, { "epoch": 7.705301455301456, "grad_norm": 0.6781425476074219, "learning_rate": 3.859847300970425e-05, "loss": 0.0698, "num_input_tokens_seen": 2825864, "step": 14825 }, { "epoch": 7.707900207900208, "grad_norm": 0.9701134562492371, "learning_rate": 3.8588956609438185e-05, "loss": 0.1886, "num_input_tokens_seen": 2826824, "step": 14830 }, { "epoch": 7.710498960498961, "grad_norm": 0.11611692607402802, "learning_rate": 3.8579437413599933e-05, "loss": 0.1167, "num_input_tokens_seen": 2827784, "step": 14835 }, { "epoch": 7.713097713097713, "grad_norm": 0.7052004337310791, "learning_rate": 3.856991542414781e-05, "loss": 0.1267, "num_input_tokens_seen": 2828712, "step": 14840 }, { "epoch": 7.715696465696466, "grad_norm": 1.336746096611023, "learning_rate": 3.856039064304071e-05, "loss": 0.1751, "num_input_tokens_seen": 2829672, "step": 14845 }, { "epoch": 7.718295218295218, "grad_norm": 0.7757983803749084, "learning_rate": 3.855086307223811e-05, "loss": 0.1556, "num_input_tokens_seen": 2830600, "step": 14850 }, { "epoch": 7.720893970893971, "grad_norm": 0.6988540887832642, "learning_rate": 3.8541332713700065e-05, "loss": 0.1478, "num_input_tokens_seen": 2831560, "step": 14855 }, { "epoch": 7.723492723492724, "grad_norm": 0.3157126307487488, "learning_rate": 3.853179956938718e-05, "loss": 0.1575, "num_input_tokens_seen": 2832488, "step": 14860 }, { "epoch": 7.726091476091476, "grad_norm": 0.8763447403907776, "learning_rate": 3.852226364126067e-05, "loss": 0.0732, "num_input_tokens_seen": 2833448, "step": 14865 }, { "epoch": 7.728690228690229, "grad_norm": 0.31388935446739197, "learning_rate": 3.851272493128229e-05, "loss": 0.0855, "num_input_tokens_seen": 2834376, "step": 14870 }, { "epoch": 7.731288981288982, "grad_norm": 0.8666530251502991, "learning_rate": 3.850318344141439e-05, "loss": 0.083, "num_input_tokens_seen": 2835272, "step": 14875 }, { "epoch": 7.733887733887734, "grad_norm": 0.6330180168151855, "learning_rate": 3.849363917361987e-05, "loss": 0.0988, "num_input_tokens_seen": 2836168, "step": 14880 }, { "epoch": 7.736486486486487, "grad_norm": 1.1355818510055542, "learning_rate": 3.848409212986222e-05, "loss": 0.2963, "num_input_tokens_seen": 2837096, "step": 14885 }, { "epoch": 7.739085239085239, "grad_norm": 1.1915358304977417, "learning_rate": 3.8474542312105485e-05, "loss": 0.1628, "num_input_tokens_seen": 2838024, "step": 14890 }, { "epoch": 7.741683991683992, "grad_norm": 0.120152048766613, "learning_rate": 3.846498972231431e-05, "loss": 0.0405, "num_input_tokens_seen": 2838984, "step": 14895 }, { "epoch": 7.744282744282744, "grad_norm": 0.31451091170310974, "learning_rate": 3.845543436245387e-05, "loss": 0.1493, "num_input_tokens_seen": 2839976, "step": 14900 }, { "epoch": 7.746881496881497, "grad_norm": 1.3660893440246582, "learning_rate": 3.844587623448995e-05, "loss": 0.1955, "num_input_tokens_seen": 2840936, "step": 14905 }, { "epoch": 7.74948024948025, "grad_norm": 0.21785187721252441, "learning_rate": 3.8436315340388875e-05, "loss": 0.1524, "num_input_tokens_seen": 2841928, "step": 14910 }, { "epoch": 7.752079002079002, "grad_norm": 0.8409267067909241, "learning_rate": 3.842675168211753e-05, "loss": 0.306, "num_input_tokens_seen": 2842920, "step": 14915 }, { "epoch": 7.754677754677755, "grad_norm": 0.35358792543411255, "learning_rate": 3.8417185261643417e-05, "loss": 0.1495, "num_input_tokens_seen": 2843848, "step": 14920 }, { "epoch": 7.757276507276507, "grad_norm": 0.26366284489631653, "learning_rate": 3.840761608093456e-05, "loss": 0.2498, "num_input_tokens_seen": 2844840, "step": 14925 }, { "epoch": 7.75987525987526, "grad_norm": 0.9151294231414795, "learning_rate": 3.839804414195958e-05, "loss": 0.2114, "num_input_tokens_seen": 2845768, "step": 14930 }, { "epoch": 7.762474012474012, "grad_norm": 0.5029309391975403, "learning_rate": 3.838846944668764e-05, "loss": 0.1163, "num_input_tokens_seen": 2846728, "step": 14935 }, { "epoch": 7.765072765072765, "grad_norm": 0.7434835433959961, "learning_rate": 3.837889199708848e-05, "loss": 0.1714, "num_input_tokens_seen": 2847688, "step": 14940 }, { "epoch": 7.767671517671518, "grad_norm": 0.5809860229492188, "learning_rate": 3.8369311795132435e-05, "loss": 0.0784, "num_input_tokens_seen": 2848648, "step": 14945 }, { "epoch": 7.77027027027027, "grad_norm": 0.28449106216430664, "learning_rate": 3.835972884279036e-05, "loss": 0.0887, "num_input_tokens_seen": 2849576, "step": 14950 }, { "epoch": 7.772869022869023, "grad_norm": 0.5467974543571472, "learning_rate": 3.835014314203369e-05, "loss": 0.1991, "num_input_tokens_seen": 2850568, "step": 14955 }, { "epoch": 7.775467775467775, "grad_norm": 1.101345419883728, "learning_rate": 3.834055469483444e-05, "loss": 0.2123, "num_input_tokens_seen": 2851496, "step": 14960 }, { "epoch": 7.778066528066528, "grad_norm": 0.4923240542411804, "learning_rate": 3.83309635031652e-05, "loss": 0.1432, "num_input_tokens_seen": 2852456, "step": 14965 }, { "epoch": 7.78066528066528, "grad_norm": 0.33598440885543823, "learning_rate": 3.8321369568999075e-05, "loss": 0.1169, "num_input_tokens_seen": 2853416, "step": 14970 }, { "epoch": 7.783264033264033, "grad_norm": 0.7087386846542358, "learning_rate": 3.831177289430979e-05, "loss": 0.134, "num_input_tokens_seen": 2854440, "step": 14975 }, { "epoch": 7.785862785862786, "grad_norm": 0.8152046799659729, "learning_rate": 3.8302173481071604e-05, "loss": 0.2169, "num_input_tokens_seen": 2855496, "step": 14980 }, { "epoch": 7.788461538461538, "grad_norm": 0.7240963578224182, "learning_rate": 3.829257133125933e-05, "loss": 0.1445, "num_input_tokens_seen": 2856456, "step": 14985 }, { "epoch": 7.791060291060291, "grad_norm": 1.5101304054260254, "learning_rate": 3.828296644684838e-05, "loss": 0.1939, "num_input_tokens_seen": 2857480, "step": 14990 }, { "epoch": 7.793659043659043, "grad_norm": 1.0511730909347534, "learning_rate": 3.8273358829814696e-05, "loss": 0.2235, "num_input_tokens_seen": 2858408, "step": 14995 }, { "epoch": 7.796257796257796, "grad_norm": 0.2670905888080597, "learning_rate": 3.826374848213481e-05, "loss": 0.0838, "num_input_tokens_seen": 2859336, "step": 15000 }, { "epoch": 7.798856548856548, "grad_norm": 0.9250243902206421, "learning_rate": 3.8254135405785776e-05, "loss": 0.1628, "num_input_tokens_seen": 2860360, "step": 15005 }, { "epoch": 7.801455301455301, "grad_norm": 1.1710237264633179, "learning_rate": 3.8244519602745245e-05, "loss": 0.2435, "num_input_tokens_seen": 2861320, "step": 15010 }, { "epoch": 7.804054054054054, "grad_norm": 0.1745443493127823, "learning_rate": 3.8234901074991416e-05, "loss": 0.0351, "num_input_tokens_seen": 2862280, "step": 15015 }, { "epoch": 7.8066528066528065, "grad_norm": 1.544271469116211, "learning_rate": 3.8225279824503034e-05, "loss": 0.2105, "num_input_tokens_seen": 2863208, "step": 15020 }, { "epoch": 7.8092515592515594, "grad_norm": 0.5035855770111084, "learning_rate": 3.821565585325946e-05, "loss": 0.0801, "num_input_tokens_seen": 2864200, "step": 15025 }, { "epoch": 7.8118503118503115, "grad_norm": 0.4066125750541687, "learning_rate": 3.820602916324053e-05, "loss": 0.1843, "num_input_tokens_seen": 2865128, "step": 15030 }, { "epoch": 7.8144490644490645, "grad_norm": 0.21257247030735016, "learning_rate": 3.819639975642671e-05, "loss": 0.1546, "num_input_tokens_seen": 2866088, "step": 15035 }, { "epoch": 7.817047817047817, "grad_norm": 0.8507277369499207, "learning_rate": 3.8186767634798984e-05, "loss": 0.2288, "num_input_tokens_seen": 2867080, "step": 15040 }, { "epoch": 7.81964656964657, "grad_norm": 0.1465032696723938, "learning_rate": 3.817713280033892e-05, "loss": 0.1786, "num_input_tokens_seen": 2868008, "step": 15045 }, { "epoch": 7.8222453222453225, "grad_norm": 0.224019393324852, "learning_rate": 3.816749525502863e-05, "loss": 0.1367, "num_input_tokens_seen": 2869032, "step": 15050 }, { "epoch": 7.824844074844075, "grad_norm": 0.62916100025177, "learning_rate": 3.815785500085078e-05, "loss": 0.1049, "num_input_tokens_seen": 2870024, "step": 15055 }, { "epoch": 7.827442827442828, "grad_norm": 0.6263087391853333, "learning_rate": 3.81482120397886e-05, "loss": 0.1228, "num_input_tokens_seen": 2870984, "step": 15060 }, { "epoch": 7.83004158004158, "grad_norm": 0.3589317202568054, "learning_rate": 3.813856637382589e-05, "loss": 0.109, "num_input_tokens_seen": 2871976, "step": 15065 }, { "epoch": 7.832640332640333, "grad_norm": 0.275790810585022, "learning_rate": 3.812891800494699e-05, "loss": 0.0906, "num_input_tokens_seen": 2872904, "step": 15070 }, { "epoch": 7.835239085239085, "grad_norm": 1.079853892326355, "learning_rate": 3.811926693513679e-05, "loss": 0.0694, "num_input_tokens_seen": 2873832, "step": 15075 }, { "epoch": 7.837837837837838, "grad_norm": 0.37589240074157715, "learning_rate": 3.8109613166380745e-05, "loss": 0.0359, "num_input_tokens_seen": 2874792, "step": 15080 }, { "epoch": 7.840436590436591, "grad_norm": 0.07559923082590103, "learning_rate": 3.809995670066487e-05, "loss": 0.1292, "num_input_tokens_seen": 2875752, "step": 15085 }, { "epoch": 7.843035343035343, "grad_norm": 0.4307945966720581, "learning_rate": 3.809029753997574e-05, "loss": 0.1132, "num_input_tokens_seen": 2876712, "step": 15090 }, { "epoch": 7.845634095634096, "grad_norm": 0.6449285745620728, "learning_rate": 3.8080635686300455e-05, "loss": 0.1706, "num_input_tokens_seen": 2877608, "step": 15095 }, { "epoch": 7.848232848232849, "grad_norm": 0.3956112265586853, "learning_rate": 3.8070971141626696e-05, "loss": 0.1324, "num_input_tokens_seen": 2878536, "step": 15100 }, { "epoch": 7.850831600831601, "grad_norm": 0.18894918262958527, "learning_rate": 3.80613039079427e-05, "loss": 0.0885, "num_input_tokens_seen": 2879528, "step": 15105 }, { "epoch": 7.853430353430354, "grad_norm": 0.7253974676132202, "learning_rate": 3.805163398723724e-05, "loss": 0.1569, "num_input_tokens_seen": 2880488, "step": 15110 }, { "epoch": 7.856029106029106, "grad_norm": 1.3714603185653687, "learning_rate": 3.804196138149965e-05, "loss": 0.1392, "num_input_tokens_seen": 2881480, "step": 15115 }, { "epoch": 7.858627858627859, "grad_norm": 1.1738866567611694, "learning_rate": 3.8032286092719804e-05, "loss": 0.0782, "num_input_tokens_seen": 2882376, "step": 15120 }, { "epoch": 7.861226611226611, "grad_norm": 1.0036559104919434, "learning_rate": 3.8022608122888156e-05, "loss": 0.1318, "num_input_tokens_seen": 2883400, "step": 15125 }, { "epoch": 7.863825363825364, "grad_norm": 0.09299188107252121, "learning_rate": 3.8012927473995684e-05, "loss": 0.1001, "num_input_tokens_seen": 2884264, "step": 15130 }, { "epoch": 7.866424116424117, "grad_norm": 0.44893836975097656, "learning_rate": 3.800324414803395e-05, "loss": 0.1534, "num_input_tokens_seen": 2885192, "step": 15135 }, { "epoch": 7.869022869022869, "grad_norm": 0.41137629747390747, "learning_rate": 3.799355814699501e-05, "loss": 0.0782, "num_input_tokens_seen": 2886088, "step": 15140 }, { "epoch": 7.871621621621622, "grad_norm": 0.3431912958621979, "learning_rate": 3.798386947287153e-05, "loss": 0.1225, "num_input_tokens_seen": 2887080, "step": 15145 }, { "epoch": 7.874220374220374, "grad_norm": 0.11940134316682816, "learning_rate": 3.797417812765669e-05, "loss": 0.1017, "num_input_tokens_seen": 2888040, "step": 15150 }, { "epoch": 7.876819126819127, "grad_norm": 0.6004114747047424, "learning_rate": 3.796448411334424e-05, "loss": 0.1699, "num_input_tokens_seen": 2888968, "step": 15155 }, { "epoch": 7.879417879417879, "grad_norm": 0.8111090660095215, "learning_rate": 3.795478743192846e-05, "loss": 0.0899, "num_input_tokens_seen": 2889960, "step": 15160 }, { "epoch": 7.882016632016632, "grad_norm": 0.530095636844635, "learning_rate": 3.7945088085404195e-05, "loss": 0.2383, "num_input_tokens_seen": 2891016, "step": 15165 }, { "epoch": 7.884615384615385, "grad_norm": 0.45212820172309875, "learning_rate": 3.793538607576684e-05, "loss": 0.1388, "num_input_tokens_seen": 2891976, "step": 15170 }, { "epoch": 7.887214137214137, "grad_norm": 1.2920843362808228, "learning_rate": 3.7925681405012303e-05, "loss": 0.215, "num_input_tokens_seen": 2892872, "step": 15175 }, { "epoch": 7.88981288981289, "grad_norm": 0.2462465912103653, "learning_rate": 3.79159740751371e-05, "loss": 0.0504, "num_input_tokens_seen": 2893768, "step": 15180 }, { "epoch": 7.892411642411642, "grad_norm": 1.045357584953308, "learning_rate": 3.790626408813822e-05, "loss": 0.1638, "num_input_tokens_seen": 2894728, "step": 15185 }, { "epoch": 7.895010395010395, "grad_norm": 1.2282029390335083, "learning_rate": 3.7896551446013274e-05, "loss": 0.1951, "num_input_tokens_seen": 2895656, "step": 15190 }, { "epoch": 7.897609147609147, "grad_norm": 1.6843966245651245, "learning_rate": 3.788683615076037e-05, "loss": 0.2291, "num_input_tokens_seen": 2896584, "step": 15195 }, { "epoch": 7.9002079002079, "grad_norm": 0.5006508231163025, "learning_rate": 3.7877118204378174e-05, "loss": 0.1135, "num_input_tokens_seen": 2897512, "step": 15200 }, { "epoch": 7.902806652806653, "grad_norm": 0.19019857048988342, "learning_rate": 3.7867397608865905e-05, "loss": 0.1333, "num_input_tokens_seen": 2898536, "step": 15205 }, { "epoch": 7.905405405405405, "grad_norm": 1.390690803527832, "learning_rate": 3.785767436622332e-05, "loss": 0.2655, "num_input_tokens_seen": 2899464, "step": 15210 }, { "epoch": 7.908004158004158, "grad_norm": 0.5633550882339478, "learning_rate": 3.784794847845072e-05, "loss": 0.0868, "num_input_tokens_seen": 2900424, "step": 15215 }, { "epoch": 7.91060291060291, "grad_norm": 0.4608485996723175, "learning_rate": 3.783821994754895e-05, "loss": 0.0936, "num_input_tokens_seen": 2901416, "step": 15220 }, { "epoch": 7.913201663201663, "grad_norm": 0.8595554232597351, "learning_rate": 3.78284887755194e-05, "loss": 0.1481, "num_input_tokens_seen": 2902376, "step": 15225 }, { "epoch": 7.915800415800415, "grad_norm": 0.19355520606040955, "learning_rate": 3.7818754964364004e-05, "loss": 0.1929, "num_input_tokens_seen": 2903336, "step": 15230 }, { "epoch": 7.918399168399168, "grad_norm": 1.0639231204986572, "learning_rate": 3.780901851608525e-05, "loss": 0.0904, "num_input_tokens_seen": 2904328, "step": 15235 }, { "epoch": 7.920997920997921, "grad_norm": 0.15170513093471527, "learning_rate": 3.779927943268614e-05, "loss": 0.0509, "num_input_tokens_seen": 2905288, "step": 15240 }, { "epoch": 7.923596673596673, "grad_norm": 0.2814210057258606, "learning_rate": 3.7789537716170256e-05, "loss": 0.0863, "num_input_tokens_seen": 2906248, "step": 15245 }, { "epoch": 7.926195426195426, "grad_norm": 1.2968370914459229, "learning_rate": 3.777979336854168e-05, "loss": 0.1938, "num_input_tokens_seen": 2907176, "step": 15250 }, { "epoch": 7.9287941787941785, "grad_norm": 0.47093772888183594, "learning_rate": 3.777004639180506e-05, "loss": 0.2653, "num_input_tokens_seen": 2908200, "step": 15255 }, { "epoch": 7.9313929313929314, "grad_norm": 1.4993364810943604, "learning_rate": 3.77602967879656e-05, "loss": 0.3296, "num_input_tokens_seen": 2909000, "step": 15260 }, { "epoch": 7.9339916839916835, "grad_norm": 0.2747899293899536, "learning_rate": 3.775054455902901e-05, "loss": 0.1194, "num_input_tokens_seen": 2910024, "step": 15265 }, { "epoch": 7.9365904365904365, "grad_norm": 0.5319184064865112, "learning_rate": 3.7740789707001555e-05, "loss": 0.1649, "num_input_tokens_seen": 2910984, "step": 15270 }, { "epoch": 7.9391891891891895, "grad_norm": 0.5489887595176697, "learning_rate": 3.773103223389005e-05, "loss": 0.2164, "num_input_tokens_seen": 2911880, "step": 15275 }, { "epoch": 7.941787941787942, "grad_norm": 0.9241762757301331, "learning_rate": 3.7721272141701827e-05, "loss": 0.1375, "num_input_tokens_seen": 2912904, "step": 15280 }, { "epoch": 7.9443866943866945, "grad_norm": 0.5749831795692444, "learning_rate": 3.771150943244479e-05, "loss": 0.1352, "num_input_tokens_seen": 2913864, "step": 15285 }, { "epoch": 7.946985446985447, "grad_norm": 0.5678104162216187, "learning_rate": 3.770174410812733e-05, "loss": 0.1581, "num_input_tokens_seen": 2914824, "step": 15290 }, { "epoch": 7.9495841995842, "grad_norm": 0.8776115775108337, "learning_rate": 3.769197617075844e-05, "loss": 0.1747, "num_input_tokens_seen": 2915816, "step": 15295 }, { "epoch": 7.952182952182953, "grad_norm": 0.663341760635376, "learning_rate": 3.76822056223476e-05, "loss": 0.1673, "num_input_tokens_seen": 2916776, "step": 15300 }, { "epoch": 7.954781704781705, "grad_norm": 0.9818020462989807, "learning_rate": 3.767243246490484e-05, "loss": 0.1937, "num_input_tokens_seen": 2917672, "step": 15305 }, { "epoch": 7.957380457380458, "grad_norm": 0.3233587145805359, "learning_rate": 3.7662656700440744e-05, "loss": 0.1091, "num_input_tokens_seen": 2918664, "step": 15310 }, { "epoch": 7.95997920997921, "grad_norm": 0.3882543444633484, "learning_rate": 3.7652878330966415e-05, "loss": 0.0795, "num_input_tokens_seen": 2919656, "step": 15315 }, { "epoch": 7.962577962577963, "grad_norm": 0.24800479412078857, "learning_rate": 3.7643097358493485e-05, "loss": 0.07, "num_input_tokens_seen": 2920616, "step": 15320 }, { "epoch": 7.965176715176716, "grad_norm": 0.5115184783935547, "learning_rate": 3.763331378503415e-05, "loss": 0.1031, "num_input_tokens_seen": 2921512, "step": 15325 }, { "epoch": 7.967775467775468, "grad_norm": 0.8052148818969727, "learning_rate": 3.762352761260112e-05, "loss": 0.0977, "num_input_tokens_seen": 2922472, "step": 15330 }, { "epoch": 7.970374220374221, "grad_norm": 0.5045651197433472, "learning_rate": 3.7613738843207644e-05, "loss": 0.068, "num_input_tokens_seen": 2923368, "step": 15335 }, { "epoch": 7.972972972972973, "grad_norm": 0.7595815062522888, "learning_rate": 3.76039474788675e-05, "loss": 0.145, "num_input_tokens_seen": 2924392, "step": 15340 }, { "epoch": 7.975571725571726, "grad_norm": 0.5873652696609497, "learning_rate": 3.7594153521595e-05, "loss": 0.1807, "num_input_tokens_seen": 2925352, "step": 15345 }, { "epoch": 7.978170478170478, "grad_norm": 0.37759289145469666, "learning_rate": 3.7584356973405017e-05, "loss": 0.1354, "num_input_tokens_seen": 2926376, "step": 15350 }, { "epoch": 7.980769230769231, "grad_norm": 0.4794398844242096, "learning_rate": 3.7574557836312905e-05, "loss": 0.068, "num_input_tokens_seen": 2927272, "step": 15355 }, { "epoch": 7.983367983367984, "grad_norm": 0.7348859906196594, "learning_rate": 3.756475611233461e-05, "loss": 0.1868, "num_input_tokens_seen": 2928232, "step": 15360 }, { "epoch": 7.985966735966736, "grad_norm": 0.9130607843399048, "learning_rate": 3.755495180348656e-05, "loss": 0.2417, "num_input_tokens_seen": 2929192, "step": 15365 }, { "epoch": 7.988565488565489, "grad_norm": 1.1514116525650024, "learning_rate": 3.754514491178574e-05, "loss": 0.1452, "num_input_tokens_seen": 2930120, "step": 15370 }, { "epoch": 7.991164241164241, "grad_norm": 0.3003890812397003, "learning_rate": 3.753533543924965e-05, "loss": 0.0947, "num_input_tokens_seen": 2931048, "step": 15375 }, { "epoch": 7.993762993762994, "grad_norm": 0.31925442814826965, "learning_rate": 3.752552338789635e-05, "loss": 0.1541, "num_input_tokens_seen": 2931944, "step": 15380 }, { "epoch": 7.996361746361746, "grad_norm": 0.9186578392982483, "learning_rate": 3.751570875974441e-05, "loss": 0.2186, "num_input_tokens_seen": 2933000, "step": 15385 }, { "epoch": 7.998960498960499, "grad_norm": 1.1918854713439941, "learning_rate": 3.750589155681292e-05, "loss": 0.1344, "num_input_tokens_seen": 2934024, "step": 15390 }, { "epoch": 8.0, "eval_loss": 0.14882275462150574, "eval_runtime": 8.0232, "eval_samples_per_second": 106.691, "eval_steps_per_second": 26.673, "num_input_tokens_seen": 2934360, "step": 15392 }, { "epoch": 8.001559251559252, "grad_norm": 0.14071990549564362, "learning_rate": 3.7496071781121535e-05, "loss": 0.1038, "num_input_tokens_seen": 2935000, "step": 15395 }, { "epoch": 8.004158004158004, "grad_norm": 0.6877288222312927, "learning_rate": 3.7486249434690405e-05, "loss": 0.1459, "num_input_tokens_seen": 2935960, "step": 15400 }, { "epoch": 8.006756756756756, "grad_norm": 0.3846376836299896, "learning_rate": 3.7476424519540206e-05, "loss": 0.1682, "num_input_tokens_seen": 2936920, "step": 15405 }, { "epoch": 8.00935550935551, "grad_norm": 0.12658022344112396, "learning_rate": 3.7466597037692177e-05, "loss": 0.0941, "num_input_tokens_seen": 2937816, "step": 15410 }, { "epoch": 8.011954261954262, "grad_norm": 0.3567568361759186, "learning_rate": 3.7456766991168055e-05, "loss": 0.0949, "num_input_tokens_seen": 2938808, "step": 15415 }, { "epoch": 8.014553014553014, "grad_norm": 0.5153211951255798, "learning_rate": 3.744693438199012e-05, "loss": 0.0398, "num_input_tokens_seen": 2939704, "step": 15420 }, { "epoch": 8.017151767151768, "grad_norm": 0.9763549566268921, "learning_rate": 3.7437099212181176e-05, "loss": 0.1989, "num_input_tokens_seen": 2940664, "step": 15425 }, { "epoch": 8.01975051975052, "grad_norm": 0.6757123470306396, "learning_rate": 3.742726148376455e-05, "loss": 0.1217, "num_input_tokens_seen": 2941624, "step": 15430 }, { "epoch": 8.022349272349272, "grad_norm": 0.8364933133125305, "learning_rate": 3.741742119876409e-05, "loss": 0.1231, "num_input_tokens_seen": 2942648, "step": 15435 }, { "epoch": 8.024948024948024, "grad_norm": 0.17699910700321198, "learning_rate": 3.7407578359204187e-05, "loss": 0.1511, "num_input_tokens_seen": 2943576, "step": 15440 }, { "epoch": 8.027546777546778, "grad_norm": 0.28388944268226624, "learning_rate": 3.739773296710974e-05, "loss": 0.0879, "num_input_tokens_seen": 2944536, "step": 15445 }, { "epoch": 8.03014553014553, "grad_norm": 0.6883423328399658, "learning_rate": 3.738788502450618e-05, "loss": 0.1222, "num_input_tokens_seen": 2945560, "step": 15450 }, { "epoch": 8.032744282744282, "grad_norm": 0.8663184642791748, "learning_rate": 3.737803453341947e-05, "loss": 0.0696, "num_input_tokens_seen": 2946488, "step": 15455 }, { "epoch": 8.035343035343036, "grad_norm": 0.4655107855796814, "learning_rate": 3.73681814958761e-05, "loss": 0.1005, "num_input_tokens_seen": 2947448, "step": 15460 }, { "epoch": 8.037941787941788, "grad_norm": 1.7780874967575073, "learning_rate": 3.7358325913903054e-05, "loss": 0.2096, "num_input_tokens_seen": 2948408, "step": 15465 }, { "epoch": 8.04054054054054, "grad_norm": 0.6902502775192261, "learning_rate": 3.7348467789527867e-05, "loss": 0.084, "num_input_tokens_seen": 2949336, "step": 15470 }, { "epoch": 8.043139293139292, "grad_norm": 1.3020355701446533, "learning_rate": 3.733860712477859e-05, "loss": 0.1983, "num_input_tokens_seen": 2950328, "step": 15475 }, { "epoch": 8.045738045738046, "grad_norm": 0.2820797562599182, "learning_rate": 3.732874392168381e-05, "loss": 0.125, "num_input_tokens_seen": 2951320, "step": 15480 }, { "epoch": 8.048336798336798, "grad_norm": 0.8036869764328003, "learning_rate": 3.73188781822726e-05, "loss": 0.3975, "num_input_tokens_seen": 2952344, "step": 15485 }, { "epoch": 8.05093555093555, "grad_norm": 0.07947415113449097, "learning_rate": 3.7309009908574596e-05, "loss": 0.15, "num_input_tokens_seen": 2953304, "step": 15490 }, { "epoch": 8.053534303534304, "grad_norm": 0.7677738666534424, "learning_rate": 3.729913910261993e-05, "loss": 0.2048, "num_input_tokens_seen": 2954264, "step": 15495 }, { "epoch": 8.056133056133056, "grad_norm": 0.7842394113540649, "learning_rate": 3.728926576643926e-05, "loss": 0.2362, "num_input_tokens_seen": 2955224, "step": 15500 }, { "epoch": 8.058731808731808, "grad_norm": 1.268028974533081, "learning_rate": 3.727938990206377e-05, "loss": 0.1911, "num_input_tokens_seen": 2956152, "step": 15505 }, { "epoch": 8.06133056133056, "grad_norm": 0.5886912941932678, "learning_rate": 3.726951151152516e-05, "loss": 0.0859, "num_input_tokens_seen": 2957016, "step": 15510 }, { "epoch": 8.063929313929314, "grad_norm": 0.7786649465560913, "learning_rate": 3.725963059685564e-05, "loss": 0.1068, "num_input_tokens_seen": 2957912, "step": 15515 }, { "epoch": 8.066528066528067, "grad_norm": 0.12221981585025787, "learning_rate": 3.724974716008798e-05, "loss": 0.1324, "num_input_tokens_seen": 2958872, "step": 15520 }, { "epoch": 8.069126819126819, "grad_norm": 0.5792038440704346, "learning_rate": 3.7239861203255404e-05, "loss": 0.0866, "num_input_tokens_seen": 2959832, "step": 15525 }, { "epoch": 8.071725571725572, "grad_norm": 0.8860507011413574, "learning_rate": 3.72299727283917e-05, "loss": 0.1059, "num_input_tokens_seen": 2960728, "step": 15530 }, { "epoch": 8.074324324324325, "grad_norm": 0.6783907413482666, "learning_rate": 3.722008173753118e-05, "loss": 0.2058, "num_input_tokens_seen": 2961624, "step": 15535 }, { "epoch": 8.076923076923077, "grad_norm": 0.306918740272522, "learning_rate": 3.721018823270863e-05, "loss": 0.2409, "num_input_tokens_seen": 2962552, "step": 15540 }, { "epoch": 8.079521829521829, "grad_norm": 1.615799069404602, "learning_rate": 3.72002922159594e-05, "loss": 0.1708, "num_input_tokens_seen": 2963512, "step": 15545 }, { "epoch": 8.082120582120583, "grad_norm": 0.10474775731563568, "learning_rate": 3.719039368931932e-05, "loss": 0.1365, "num_input_tokens_seen": 2964504, "step": 15550 }, { "epoch": 8.084719334719335, "grad_norm": 1.7353715896606445, "learning_rate": 3.7180492654824765e-05, "loss": 0.1597, "num_input_tokens_seen": 2965496, "step": 15555 }, { "epoch": 8.087318087318087, "grad_norm": 0.7460533380508423, "learning_rate": 3.7170589114512616e-05, "loss": 0.1245, "num_input_tokens_seen": 2966456, "step": 15560 }, { "epoch": 8.08991683991684, "grad_norm": 0.6002745032310486, "learning_rate": 3.716068307042026e-05, "loss": 0.1959, "num_input_tokens_seen": 2967352, "step": 15565 }, { "epoch": 8.092515592515593, "grad_norm": 0.1508852243423462, "learning_rate": 3.71507745245856e-05, "loss": 0.2392, "num_input_tokens_seen": 2968312, "step": 15570 }, { "epoch": 8.095114345114345, "grad_norm": 1.06659996509552, "learning_rate": 3.7140863479047074e-05, "loss": 0.174, "num_input_tokens_seen": 2969272, "step": 15575 }, { "epoch": 8.097713097713097, "grad_norm": 0.8786914348602295, "learning_rate": 3.7130949935843614e-05, "loss": 0.1507, "num_input_tokens_seen": 2970232, "step": 15580 }, { "epoch": 8.10031185031185, "grad_norm": 0.767558753490448, "learning_rate": 3.7121033897014663e-05, "loss": 0.1603, "num_input_tokens_seen": 2971224, "step": 15585 }, { "epoch": 8.102910602910603, "grad_norm": 0.47243231534957886, "learning_rate": 3.711111536460021e-05, "loss": 0.1087, "num_input_tokens_seen": 2972248, "step": 15590 }, { "epoch": 8.105509355509355, "grad_norm": 0.8091134428977966, "learning_rate": 3.710119434064071e-05, "loss": 0.1012, "num_input_tokens_seen": 2973240, "step": 15595 }, { "epoch": 8.108108108108109, "grad_norm": 0.3565730154514313, "learning_rate": 3.7091270827177167e-05, "loss": 0.1775, "num_input_tokens_seen": 2974264, "step": 15600 }, { "epoch": 8.11070686070686, "grad_norm": 0.3039185702800751, "learning_rate": 3.7081344826251076e-05, "loss": 0.0569, "num_input_tokens_seen": 2975160, "step": 15605 }, { "epoch": 8.113305613305613, "grad_norm": 0.29022061824798584, "learning_rate": 3.707141633990446e-05, "loss": 0.1511, "num_input_tokens_seen": 2976152, "step": 15610 }, { "epoch": 8.115904365904365, "grad_norm": 0.4217926263809204, "learning_rate": 3.7061485370179835e-05, "loss": 0.085, "num_input_tokens_seen": 2977144, "step": 15615 }, { "epoch": 8.118503118503119, "grad_norm": 0.1812547743320465, "learning_rate": 3.705155191912025e-05, "loss": 0.0546, "num_input_tokens_seen": 2978072, "step": 15620 }, { "epoch": 8.121101871101871, "grad_norm": 0.6013325452804565, "learning_rate": 3.704161598876925e-05, "loss": 0.1419, "num_input_tokens_seen": 2979032, "step": 15625 }, { "epoch": 8.123700623700623, "grad_norm": 1.4714852571487427, "learning_rate": 3.7031677581170884e-05, "loss": 0.2056, "num_input_tokens_seen": 2980056, "step": 15630 }, { "epoch": 8.126299376299377, "grad_norm": 2.860658645629883, "learning_rate": 3.7021736698369724e-05, "loss": 0.148, "num_input_tokens_seen": 2981048, "step": 15635 }, { "epoch": 8.128898128898129, "grad_norm": 0.5053319931030273, "learning_rate": 3.701179334241085e-05, "loss": 0.1283, "num_input_tokens_seen": 2981944, "step": 15640 }, { "epoch": 8.131496881496881, "grad_norm": 0.704704225063324, "learning_rate": 3.700184751533985e-05, "loss": 0.1896, "num_input_tokens_seen": 2982936, "step": 15645 }, { "epoch": 8.134095634095635, "grad_norm": 1.158506155014038, "learning_rate": 3.69918992192028e-05, "loss": 0.0867, "num_input_tokens_seen": 2983864, "step": 15650 }, { "epoch": 8.136694386694387, "grad_norm": 1.1649075746536255, "learning_rate": 3.698194845604633e-05, "loss": 0.1305, "num_input_tokens_seen": 2984888, "step": 15655 }, { "epoch": 8.13929313929314, "grad_norm": 0.6411030292510986, "learning_rate": 3.697199522791752e-05, "loss": 0.1685, "num_input_tokens_seen": 2985880, "step": 15660 }, { "epoch": 8.141891891891891, "grad_norm": 0.27821221947669983, "learning_rate": 3.6962039536864006e-05, "loss": 0.1046, "num_input_tokens_seen": 2986936, "step": 15665 }, { "epoch": 8.144490644490645, "grad_norm": 1.4967211484909058, "learning_rate": 3.69520813849339e-05, "loss": 0.1223, "num_input_tokens_seen": 2987864, "step": 15670 }, { "epoch": 8.147089397089397, "grad_norm": 0.9086578488349915, "learning_rate": 3.6942120774175837e-05, "loss": 0.1456, "num_input_tokens_seen": 2988760, "step": 15675 }, { "epoch": 8.14968814968815, "grad_norm": 0.4852274954319, "learning_rate": 3.693215770663895e-05, "loss": 0.1929, "num_input_tokens_seen": 2989720, "step": 15680 }, { "epoch": 8.152286902286903, "grad_norm": 0.5057101249694824, "learning_rate": 3.6922192184372884e-05, "loss": 0.1606, "num_input_tokens_seen": 2990712, "step": 15685 }, { "epoch": 8.154885654885655, "grad_norm": 0.33947235345840454, "learning_rate": 3.691222420942778e-05, "loss": 0.0828, "num_input_tokens_seen": 2991672, "step": 15690 }, { "epoch": 8.157484407484407, "grad_norm": 0.4092788100242615, "learning_rate": 3.690225378385428e-05, "loss": 0.3023, "num_input_tokens_seen": 2992600, "step": 15695 }, { "epoch": 8.16008316008316, "grad_norm": 1.8861018419265747, "learning_rate": 3.689228090970355e-05, "loss": 0.1417, "num_input_tokens_seen": 2993528, "step": 15700 }, { "epoch": 8.162681912681913, "grad_norm": 0.48442596197128296, "learning_rate": 3.688230558902725e-05, "loss": 0.0965, "num_input_tokens_seen": 2994520, "step": 15705 }, { "epoch": 8.165280665280665, "grad_norm": 0.5469231605529785, "learning_rate": 3.687232782387753e-05, "loss": 0.1303, "num_input_tokens_seen": 2995448, "step": 15710 }, { "epoch": 8.167879417879417, "grad_norm": 0.7575369477272034, "learning_rate": 3.6862347616307064e-05, "loss": 0.118, "num_input_tokens_seen": 2996376, "step": 15715 }, { "epoch": 8.170478170478171, "grad_norm": 0.11850103735923767, "learning_rate": 3.685236496836901e-05, "loss": 0.1025, "num_input_tokens_seen": 2997400, "step": 15720 }, { "epoch": 8.173076923076923, "grad_norm": 0.8515797257423401, "learning_rate": 3.6842379882117046e-05, "loss": 0.184, "num_input_tokens_seen": 2998328, "step": 15725 }, { "epoch": 8.175675675675675, "grad_norm": 0.5811620354652405, "learning_rate": 3.6832392359605325e-05, "loss": 0.1183, "num_input_tokens_seen": 2999288, "step": 15730 }, { "epoch": 8.178274428274428, "grad_norm": 0.3180166184902191, "learning_rate": 3.682240240288854e-05, "loss": 0.0571, "num_input_tokens_seen": 3000280, "step": 15735 }, { "epoch": 8.180873180873181, "grad_norm": 0.7258965969085693, "learning_rate": 3.6812410014021845e-05, "loss": 0.1896, "num_input_tokens_seen": 3001176, "step": 15740 }, { "epoch": 8.183471933471933, "grad_norm": 1.3966097831726074, "learning_rate": 3.6802415195060927e-05, "loss": 0.1899, "num_input_tokens_seen": 3002136, "step": 15745 }, { "epoch": 8.186070686070686, "grad_norm": 0.48990651965141296, "learning_rate": 3.679241794806195e-05, "loss": 0.0633, "num_input_tokens_seen": 3003096, "step": 15750 }, { "epoch": 8.18866943866944, "grad_norm": 0.8177563548088074, "learning_rate": 3.678241827508159e-05, "loss": 0.1681, "num_input_tokens_seen": 3004088, "step": 15755 }, { "epoch": 8.191268191268192, "grad_norm": 0.6740220785140991, "learning_rate": 3.6772416178177024e-05, "loss": 0.0934, "num_input_tokens_seen": 3005048, "step": 15760 }, { "epoch": 8.193866943866944, "grad_norm": 0.12047596275806427, "learning_rate": 3.6762411659405895e-05, "loss": 0.0754, "num_input_tokens_seen": 3006008, "step": 15765 }, { "epoch": 8.196465696465696, "grad_norm": 0.8178247213363647, "learning_rate": 3.675240472082641e-05, "loss": 0.1553, "num_input_tokens_seen": 3006904, "step": 15770 }, { "epoch": 8.19906444906445, "grad_norm": 0.437473326921463, "learning_rate": 3.674239536449719e-05, "loss": 0.0626, "num_input_tokens_seen": 3007864, "step": 15775 }, { "epoch": 8.201663201663202, "grad_norm": 1.0437990427017212, "learning_rate": 3.6732383592477443e-05, "loss": 0.1202, "num_input_tokens_seen": 3008824, "step": 15780 }, { "epoch": 8.204261954261954, "grad_norm": 1.0861577987670898, "learning_rate": 3.67223694068268e-05, "loss": 0.1295, "num_input_tokens_seen": 3009816, "step": 15785 }, { "epoch": 8.206860706860708, "grad_norm": 0.7724784016609192, "learning_rate": 3.6712352809605434e-05, "loss": 0.1393, "num_input_tokens_seen": 3010744, "step": 15790 }, { "epoch": 8.20945945945946, "grad_norm": 0.3562014102935791, "learning_rate": 3.670233380287398e-05, "loss": 0.1692, "num_input_tokens_seen": 3011704, "step": 15795 }, { "epoch": 8.212058212058212, "grad_norm": 0.7499366402626038, "learning_rate": 3.669231238869361e-05, "loss": 0.1314, "num_input_tokens_seen": 3012568, "step": 15800 }, { "epoch": 8.214656964656964, "grad_norm": 0.8800510168075562, "learning_rate": 3.668228856912594e-05, "loss": 0.225, "num_input_tokens_seen": 3013528, "step": 15805 }, { "epoch": 8.217255717255718, "grad_norm": 1.4735287427902222, "learning_rate": 3.667226234623313e-05, "loss": 0.1894, "num_input_tokens_seen": 3014488, "step": 15810 }, { "epoch": 8.21985446985447, "grad_norm": 0.5035913586616516, "learning_rate": 3.66622337220778e-05, "loss": 0.1185, "num_input_tokens_seen": 3015416, "step": 15815 }, { "epoch": 8.222453222453222, "grad_norm": 0.1525328904390335, "learning_rate": 3.6652202698723096e-05, "loss": 0.0764, "num_input_tokens_seen": 3016408, "step": 15820 }, { "epoch": 8.225051975051976, "grad_norm": 0.8961893916130066, "learning_rate": 3.6642169278232605e-05, "loss": 0.1263, "num_input_tokens_seen": 3017304, "step": 15825 }, { "epoch": 8.227650727650728, "grad_norm": 0.8265581727027893, "learning_rate": 3.663213346267048e-05, "loss": 0.091, "num_input_tokens_seen": 3018296, "step": 15830 }, { "epoch": 8.23024948024948, "grad_norm": 1.2292494773864746, "learning_rate": 3.6622095254101294e-05, "loss": 0.1239, "num_input_tokens_seen": 3019288, "step": 15835 }, { "epoch": 8.232848232848234, "grad_norm": 0.6958751678466797, "learning_rate": 3.661205465459017e-05, "loss": 0.1073, "num_input_tokens_seen": 3020216, "step": 15840 }, { "epoch": 8.235446985446986, "grad_norm": 0.3837352395057678, "learning_rate": 3.660201166620268e-05, "loss": 0.0957, "num_input_tokens_seen": 3021144, "step": 15845 }, { "epoch": 8.238045738045738, "grad_norm": 0.2097836136817932, "learning_rate": 3.659196629100492e-05, "loss": 0.2147, "num_input_tokens_seen": 3022072, "step": 15850 }, { "epoch": 8.24064449064449, "grad_norm": 0.6201598644256592, "learning_rate": 3.6581918531063466e-05, "loss": 0.1266, "num_input_tokens_seen": 3023096, "step": 15855 }, { "epoch": 8.243243243243244, "grad_norm": 0.08575820177793503, "learning_rate": 3.657186838844536e-05, "loss": 0.1383, "num_input_tokens_seen": 3024120, "step": 15860 }, { "epoch": 8.245841995841996, "grad_norm": 0.1872101128101349, "learning_rate": 3.6561815865218176e-05, "loss": 0.0383, "num_input_tokens_seen": 3025016, "step": 15865 }, { "epoch": 8.248440748440748, "grad_norm": 0.8413406014442444, "learning_rate": 3.6551760963449943e-05, "loss": 0.1951, "num_input_tokens_seen": 3025944, "step": 15870 }, { "epoch": 8.2510395010395, "grad_norm": 0.9280444383621216, "learning_rate": 3.6541703685209204e-05, "loss": 0.1332, "num_input_tokens_seen": 3027000, "step": 15875 }, { "epoch": 8.253638253638254, "grad_norm": 0.37834975123405457, "learning_rate": 3.653164403256498e-05, "loss": 0.1532, "num_input_tokens_seen": 3027928, "step": 15880 }, { "epoch": 8.256237006237006, "grad_norm": 0.3004954755306244, "learning_rate": 3.652158200758679e-05, "loss": 0.1243, "num_input_tokens_seen": 3028920, "step": 15885 }, { "epoch": 8.258835758835758, "grad_norm": 1.1086068153381348, "learning_rate": 3.651151761234461e-05, "loss": 0.22, "num_input_tokens_seen": 3029784, "step": 15890 }, { "epoch": 8.261434511434512, "grad_norm": 0.7162176966667175, "learning_rate": 3.650145084890895e-05, "loss": 0.1908, "num_input_tokens_seen": 3030744, "step": 15895 }, { "epoch": 8.264033264033264, "grad_norm": 0.694925844669342, "learning_rate": 3.6491381719350756e-05, "loss": 0.1758, "num_input_tokens_seen": 3031704, "step": 15900 }, { "epoch": 8.266632016632016, "grad_norm": 0.08417994529008865, "learning_rate": 3.648131022574151e-05, "loss": 0.1701, "num_input_tokens_seen": 3032632, "step": 15905 }, { "epoch": 8.26923076923077, "grad_norm": 0.6227428913116455, "learning_rate": 3.647123637015315e-05, "loss": 0.1633, "num_input_tokens_seen": 3033656, "step": 15910 }, { "epoch": 8.271829521829522, "grad_norm": 0.7860710024833679, "learning_rate": 3.646116015465811e-05, "loss": 0.064, "num_input_tokens_seen": 3034584, "step": 15915 }, { "epoch": 8.274428274428274, "grad_norm": 0.1550523042678833, "learning_rate": 3.645108158132932e-05, "loss": 0.1053, "num_input_tokens_seen": 3035576, "step": 15920 }, { "epoch": 8.277027027027026, "grad_norm": 0.7632091045379639, "learning_rate": 3.644100065224015e-05, "loss": 0.0976, "num_input_tokens_seen": 3036568, "step": 15925 }, { "epoch": 8.27962577962578, "grad_norm": 0.17164327204227448, "learning_rate": 3.643091736946451e-05, "loss": 0.1154, "num_input_tokens_seen": 3037496, "step": 15930 }, { "epoch": 8.282224532224532, "grad_norm": 1.1819231510162354, "learning_rate": 3.6420831735076774e-05, "loss": 0.0838, "num_input_tokens_seen": 3038392, "step": 15935 }, { "epoch": 8.284823284823284, "grad_norm": 0.7616673111915588, "learning_rate": 3.641074375115179e-05, "loss": 0.1413, "num_input_tokens_seen": 3039352, "step": 15940 }, { "epoch": 8.287422037422038, "grad_norm": 0.7517755627632141, "learning_rate": 3.64006534197649e-05, "loss": 0.1882, "num_input_tokens_seen": 3040408, "step": 15945 }, { "epoch": 8.29002079002079, "grad_norm": 0.20965120196342468, "learning_rate": 3.639056074299193e-05, "loss": 0.1305, "num_input_tokens_seen": 3041400, "step": 15950 }, { "epoch": 8.292619542619542, "grad_norm": 0.08603379875421524, "learning_rate": 3.638046572290917e-05, "loss": 0.1221, "num_input_tokens_seen": 3042328, "step": 15955 }, { "epoch": 8.295218295218294, "grad_norm": 0.3015754818916321, "learning_rate": 3.637036836159342e-05, "loss": 0.0677, "num_input_tokens_seen": 3043320, "step": 15960 }, { "epoch": 8.297817047817048, "grad_norm": 0.38909435272216797, "learning_rate": 3.636026866112194e-05, "loss": 0.1503, "num_input_tokens_seen": 3044312, "step": 15965 }, { "epoch": 8.3004158004158, "grad_norm": 0.806673526763916, "learning_rate": 3.635016662357248e-05, "loss": 0.089, "num_input_tokens_seen": 3045208, "step": 15970 }, { "epoch": 8.303014553014552, "grad_norm": 0.43238013982772827, "learning_rate": 3.634006225102327e-05, "loss": 0.1067, "num_input_tokens_seen": 3046200, "step": 15975 }, { "epoch": 8.305613305613306, "grad_norm": 0.9741585850715637, "learning_rate": 3.6329955545553026e-05, "loss": 0.1395, "num_input_tokens_seen": 3047096, "step": 15980 }, { "epoch": 8.308212058212058, "grad_norm": 0.763394296169281, "learning_rate": 3.631984650924094e-05, "loss": 0.1717, "num_input_tokens_seen": 3048152, "step": 15985 }, { "epoch": 8.31081081081081, "grad_norm": 0.1607009470462799, "learning_rate": 3.630973514416666e-05, "loss": 0.1726, "num_input_tokens_seen": 3049112, "step": 15990 }, { "epoch": 8.313409563409563, "grad_norm": 0.18369953334331512, "learning_rate": 3.629962145241036e-05, "loss": 0.1647, "num_input_tokens_seen": 3050168, "step": 15995 }, { "epoch": 8.316008316008316, "grad_norm": 0.52928626537323, "learning_rate": 3.6289505436052655e-05, "loss": 0.0804, "num_input_tokens_seen": 3051160, "step": 16000 }, { "epoch": 8.318607068607069, "grad_norm": 0.18779706954956055, "learning_rate": 3.627938709717465e-05, "loss": 0.0338, "num_input_tokens_seen": 3052088, "step": 16005 }, { "epoch": 8.32120582120582, "grad_norm": 0.10675255209207535, "learning_rate": 3.626926643785794e-05, "loss": 0.0562, "num_input_tokens_seen": 3053016, "step": 16010 }, { "epoch": 8.323804573804575, "grad_norm": 0.5955812931060791, "learning_rate": 3.625914346018457e-05, "loss": 0.1181, "num_input_tokens_seen": 3053912, "step": 16015 }, { "epoch": 8.326403326403327, "grad_norm": 0.6444712281227112, "learning_rate": 3.6249018166237094e-05, "loss": 0.0772, "num_input_tokens_seen": 3054872, "step": 16020 }, { "epoch": 8.329002079002079, "grad_norm": 0.3938800096511841, "learning_rate": 3.623889055809851e-05, "loss": 0.0843, "num_input_tokens_seen": 3055800, "step": 16025 }, { "epoch": 8.33160083160083, "grad_norm": 0.1517297476530075, "learning_rate": 3.622876063785232e-05, "loss": 0.0971, "num_input_tokens_seen": 3056696, "step": 16030 }, { "epoch": 8.334199584199585, "grad_norm": 1.034771203994751, "learning_rate": 3.621862840758248e-05, "loss": 0.1363, "num_input_tokens_seen": 3057656, "step": 16035 }, { "epoch": 8.336798336798337, "grad_norm": 0.2589738368988037, "learning_rate": 3.620849386937344e-05, "loss": 0.0854, "num_input_tokens_seen": 3058584, "step": 16040 }, { "epoch": 8.339397089397089, "grad_norm": 0.26988017559051514, "learning_rate": 3.619835702531011e-05, "loss": 0.1098, "num_input_tokens_seen": 3059480, "step": 16045 }, { "epoch": 8.341995841995843, "grad_norm": 0.40284380316734314, "learning_rate": 3.6188217877477886e-05, "loss": 0.0815, "num_input_tokens_seen": 3060472, "step": 16050 }, { "epoch": 8.344594594594595, "grad_norm": 0.1807190626859665, "learning_rate": 3.617807642796264e-05, "loss": 0.1843, "num_input_tokens_seen": 3061464, "step": 16055 }, { "epoch": 8.347193347193347, "grad_norm": 0.5035474300384521, "learning_rate": 3.616793267885068e-05, "loss": 0.278, "num_input_tokens_seen": 3062520, "step": 16060 }, { "epoch": 8.3497920997921, "grad_norm": 0.6601449847221375, "learning_rate": 3.615778663222885e-05, "loss": 0.1541, "num_input_tokens_seen": 3063480, "step": 16065 }, { "epoch": 8.352390852390853, "grad_norm": 0.2723184823989868, "learning_rate": 3.614763829018441e-05, "loss": 0.2543, "num_input_tokens_seen": 3064376, "step": 16070 }, { "epoch": 8.354989604989605, "grad_norm": 0.038343314081430435, "learning_rate": 3.613748765480514e-05, "loss": 0.2066, "num_input_tokens_seen": 3065272, "step": 16075 }, { "epoch": 8.357588357588357, "grad_norm": 0.898209810256958, "learning_rate": 3.612733472817924e-05, "loss": 0.152, "num_input_tokens_seen": 3066200, "step": 16080 }, { "epoch": 8.36018711018711, "grad_norm": 0.6298472881317139, "learning_rate": 3.611717951239543e-05, "loss": 0.0762, "num_input_tokens_seen": 3067256, "step": 16085 }, { "epoch": 8.362785862785863, "grad_norm": 0.8748336434364319, "learning_rate": 3.610702200954287e-05, "loss": 0.1389, "num_input_tokens_seen": 3068248, "step": 16090 }, { "epoch": 8.365384615384615, "grad_norm": 1.0730067491531372, "learning_rate": 3.6096862221711205e-05, "loss": 0.1985, "num_input_tokens_seen": 3069176, "step": 16095 }, { "epoch": 8.367983367983369, "grad_norm": 0.14151613414287567, "learning_rate": 3.6086700150990536e-05, "loss": 0.0737, "num_input_tokens_seen": 3070168, "step": 16100 }, { "epoch": 8.370582120582121, "grad_norm": 0.1673322468996048, "learning_rate": 3.607653579947145e-05, "loss": 0.0567, "num_input_tokens_seen": 3071128, "step": 16105 }, { "epoch": 8.373180873180873, "grad_norm": 1.1378769874572754, "learning_rate": 3.606636916924501e-05, "loss": 0.2225, "num_input_tokens_seen": 3072184, "step": 16110 }, { "epoch": 8.375779625779625, "grad_norm": 1.1365731954574585, "learning_rate": 3.605620026240271e-05, "loss": 0.0898, "num_input_tokens_seen": 3073080, "step": 16115 }, { "epoch": 8.378378378378379, "grad_norm": 0.20401707291603088, "learning_rate": 3.6046029081036556e-05, "loss": 0.2697, "num_input_tokens_seen": 3074040, "step": 16120 }, { "epoch": 8.380977130977131, "grad_norm": 0.5578204989433289, "learning_rate": 3.603585562723898e-05, "loss": 0.1547, "num_input_tokens_seen": 3074968, "step": 16125 }, { "epoch": 8.383575883575883, "grad_norm": 0.31601113080978394, "learning_rate": 3.602567990310292e-05, "loss": 0.0872, "num_input_tokens_seen": 3075864, "step": 16130 }, { "epoch": 8.386174636174637, "grad_norm": 0.8348582983016968, "learning_rate": 3.601550191072177e-05, "loss": 0.0962, "num_input_tokens_seen": 3076760, "step": 16135 }, { "epoch": 8.388773388773389, "grad_norm": 0.3184020221233368, "learning_rate": 3.600532165218937e-05, "loss": 0.1066, "num_input_tokens_seen": 3077816, "step": 16140 }, { "epoch": 8.391372141372141, "grad_norm": 0.9728760123252869, "learning_rate": 3.5995139129600056e-05, "loss": 0.1894, "num_input_tokens_seen": 3078776, "step": 16145 }, { "epoch": 8.393970893970893, "grad_norm": 0.8065600991249084, "learning_rate": 3.598495434504861e-05, "loss": 0.1401, "num_input_tokens_seen": 3079672, "step": 16150 }, { "epoch": 8.396569646569647, "grad_norm": 0.1658698171377182, "learning_rate": 3.5974767300630285e-05, "loss": 0.141, "num_input_tokens_seen": 3080664, "step": 16155 }, { "epoch": 8.3991683991684, "grad_norm": 0.5397520065307617, "learning_rate": 3.596457799844079e-05, "loss": 0.1089, "num_input_tokens_seen": 3081688, "step": 16160 }, { "epoch": 8.401767151767151, "grad_norm": 0.4332846403121948, "learning_rate": 3.595438644057632e-05, "loss": 0.2181, "num_input_tokens_seen": 3082616, "step": 16165 }, { "epoch": 8.404365904365905, "grad_norm": 0.3870943486690521, "learning_rate": 3.594419262913351e-05, "loss": 0.0932, "num_input_tokens_seen": 3083544, "step": 16170 }, { "epoch": 8.406964656964657, "grad_norm": 0.3094511032104492, "learning_rate": 3.593399656620949e-05, "loss": 0.0814, "num_input_tokens_seen": 3084568, "step": 16175 }, { "epoch": 8.40956340956341, "grad_norm": 1.308837652206421, "learning_rate": 3.592379825390181e-05, "loss": 0.231, "num_input_tokens_seen": 3085560, "step": 16180 }, { "epoch": 8.412162162162161, "grad_norm": 0.2800491452217102, "learning_rate": 3.5913597694308515e-05, "loss": 0.0953, "num_input_tokens_seen": 3086488, "step": 16185 }, { "epoch": 8.414760914760915, "grad_norm": 0.21066221594810486, "learning_rate": 3.5903394889528105e-05, "loss": 0.128, "num_input_tokens_seen": 3087416, "step": 16190 }, { "epoch": 8.417359667359667, "grad_norm": 0.06709324568510056, "learning_rate": 3.589318984165954e-05, "loss": 0.0623, "num_input_tokens_seen": 3088344, "step": 16195 }, { "epoch": 8.41995841995842, "grad_norm": 0.12912629544734955, "learning_rate": 3.588298255280224e-05, "loss": 0.0756, "num_input_tokens_seen": 3089336, "step": 16200 }, { "epoch": 8.422557172557173, "grad_norm": 1.1346328258514404, "learning_rate": 3.587277302505608e-05, "loss": 0.1646, "num_input_tokens_seen": 3090296, "step": 16205 }, { "epoch": 8.425155925155925, "grad_norm": 0.5368542075157166, "learning_rate": 3.586256126052143e-05, "loss": 0.1377, "num_input_tokens_seen": 3091256, "step": 16210 }, { "epoch": 8.427754677754677, "grad_norm": 0.26631659269332886, "learning_rate": 3.585234726129906e-05, "loss": 0.1319, "num_input_tokens_seen": 3092184, "step": 16215 }, { "epoch": 8.43035343035343, "grad_norm": 0.4488409757614136, "learning_rate": 3.584213102949024e-05, "loss": 0.1354, "num_input_tokens_seen": 3093080, "step": 16220 }, { "epoch": 8.432952182952183, "grad_norm": 0.2472945600748062, "learning_rate": 3.583191256719672e-05, "loss": 0.1618, "num_input_tokens_seen": 3094040, "step": 16225 }, { "epoch": 8.435550935550935, "grad_norm": 0.7679522633552551, "learning_rate": 3.582169187652065e-05, "loss": 0.0961, "num_input_tokens_seen": 3094968, "step": 16230 }, { "epoch": 8.438149688149688, "grad_norm": 0.19779136776924133, "learning_rate": 3.5811468959564676e-05, "loss": 0.171, "num_input_tokens_seen": 3095864, "step": 16235 }, { "epoch": 8.440748440748441, "grad_norm": 1.8640825748443604, "learning_rate": 3.58012438184319e-05, "loss": 0.1639, "num_input_tokens_seen": 3096824, "step": 16240 }, { "epoch": 8.443347193347194, "grad_norm": 1.6205812692642212, "learning_rate": 3.579101645522589e-05, "loss": 0.2148, "num_input_tokens_seen": 3097880, "step": 16245 }, { "epoch": 8.445945945945946, "grad_norm": 0.5276767611503601, "learning_rate": 3.578078687205063e-05, "loss": 0.15, "num_input_tokens_seen": 3098808, "step": 16250 }, { "epoch": 8.448544698544698, "grad_norm": 0.12046362459659576, "learning_rate": 3.577055507101062e-05, "loss": 0.1096, "num_input_tokens_seen": 3099704, "step": 16255 }, { "epoch": 8.451143451143452, "grad_norm": 0.336731880903244, "learning_rate": 3.5760321054210754e-05, "loss": 0.0639, "num_input_tokens_seen": 3100632, "step": 16260 }, { "epoch": 8.453742203742204, "grad_norm": 0.8033262491226196, "learning_rate": 3.575008482375643e-05, "loss": 0.1661, "num_input_tokens_seen": 3101560, "step": 16265 }, { "epoch": 8.456340956340956, "grad_norm": 0.41560617089271545, "learning_rate": 3.57398463817535e-05, "loss": 0.1063, "num_input_tokens_seen": 3102456, "step": 16270 }, { "epoch": 8.45893970893971, "grad_norm": 0.07800596207380295, "learning_rate": 3.572960573030823e-05, "loss": 0.1047, "num_input_tokens_seen": 3103384, "step": 16275 }, { "epoch": 8.461538461538462, "grad_norm": 0.6963469386100769, "learning_rate": 3.5719362871527377e-05, "loss": 0.1537, "num_input_tokens_seen": 3104344, "step": 16280 }, { "epoch": 8.464137214137214, "grad_norm": 0.20596836507320404, "learning_rate": 3.5709117807518135e-05, "loss": 0.2768, "num_input_tokens_seen": 3105304, "step": 16285 }, { "epoch": 8.466735966735968, "grad_norm": 1.7475594282150269, "learning_rate": 3.5698870540388165e-05, "loss": 0.0962, "num_input_tokens_seen": 3106264, "step": 16290 }, { "epoch": 8.46933471933472, "grad_norm": 0.7254794239997864, "learning_rate": 3.568862107224557e-05, "loss": 0.1806, "num_input_tokens_seen": 3107224, "step": 16295 }, { "epoch": 8.471933471933472, "grad_norm": 0.22565679252147675, "learning_rate": 3.567836940519891e-05, "loss": 0.1518, "num_input_tokens_seen": 3108120, "step": 16300 }, { "epoch": 8.474532224532224, "grad_norm": 0.40809932351112366, "learning_rate": 3.5668115541357194e-05, "loss": 0.1305, "num_input_tokens_seen": 3109016, "step": 16305 }, { "epoch": 8.477130977130978, "grad_norm": 0.9618295431137085, "learning_rate": 3.56578594828299e-05, "loss": 0.2129, "num_input_tokens_seen": 3109912, "step": 16310 }, { "epoch": 8.47972972972973, "grad_norm": 0.14181332290172577, "learning_rate": 3.564760123172693e-05, "loss": 0.1127, "num_input_tokens_seen": 3110904, "step": 16315 }, { "epoch": 8.482328482328482, "grad_norm": 1.1704425811767578, "learning_rate": 3.5637340790158643e-05, "loss": 0.1443, "num_input_tokens_seen": 3111800, "step": 16320 }, { "epoch": 8.484927234927236, "grad_norm": 0.6620479226112366, "learning_rate": 3.562707816023588e-05, "loss": 0.1316, "num_input_tokens_seen": 3112760, "step": 16325 }, { "epoch": 8.487525987525988, "grad_norm": 0.17519675195217133, "learning_rate": 3.56168133440699e-05, "loss": 0.1287, "num_input_tokens_seen": 3113720, "step": 16330 }, { "epoch": 8.49012474012474, "grad_norm": 0.5587667226791382, "learning_rate": 3.560654634377241e-05, "loss": 0.0856, "num_input_tokens_seen": 3114648, "step": 16335 }, { "epoch": 8.492723492723492, "grad_norm": 1.4635071754455566, "learning_rate": 3.559627716145558e-05, "loss": 0.2093, "num_input_tokens_seen": 3115672, "step": 16340 }, { "epoch": 8.495322245322246, "grad_norm": 0.41496917605400085, "learning_rate": 3.558600579923204e-05, "loss": 0.1836, "num_input_tokens_seen": 3116664, "step": 16345 }, { "epoch": 8.497920997920998, "grad_norm": 0.38926810026168823, "learning_rate": 3.5575732259214836e-05, "loss": 0.1039, "num_input_tokens_seen": 3117592, "step": 16350 }, { "epoch": 8.50051975051975, "grad_norm": 0.042898256331682205, "learning_rate": 3.556545654351749e-05, "loss": 0.0544, "num_input_tokens_seen": 3118584, "step": 16355 }, { "epoch": 8.503118503118504, "grad_norm": 0.4988807141780853, "learning_rate": 3.5555178654253955e-05, "loss": 0.1666, "num_input_tokens_seen": 3119480, "step": 16360 }, { "epoch": 8.505717255717256, "grad_norm": 0.14418570697307587, "learning_rate": 3.554489859353864e-05, "loss": 0.0715, "num_input_tokens_seen": 3120408, "step": 16365 }, { "epoch": 8.508316008316008, "grad_norm": 0.5611005425453186, "learning_rate": 3.553461636348641e-05, "loss": 0.1772, "num_input_tokens_seen": 3121336, "step": 16370 }, { "epoch": 8.51091476091476, "grad_norm": 0.703079342842102, "learning_rate": 3.552433196621254e-05, "loss": 0.1878, "num_input_tokens_seen": 3122360, "step": 16375 }, { "epoch": 8.513513513513514, "grad_norm": 0.3036767244338989, "learning_rate": 3.55140454038328e-05, "loss": 0.1236, "num_input_tokens_seen": 3123288, "step": 16380 }, { "epoch": 8.516112266112266, "grad_norm": 0.12684306502342224, "learning_rate": 3.5503756678463365e-05, "loss": 0.2155, "num_input_tokens_seen": 3124152, "step": 16385 }, { "epoch": 8.518711018711018, "grad_norm": 0.9631952047348022, "learning_rate": 3.549346579222088e-05, "loss": 0.1509, "num_input_tokens_seen": 3125048, "step": 16390 }, { "epoch": 8.521309771309772, "grad_norm": 0.49374887347221375, "learning_rate": 3.548317274722242e-05, "loss": 0.2291, "num_input_tokens_seen": 3126072, "step": 16395 }, { "epoch": 8.523908523908524, "grad_norm": 1.0223674774169922, "learning_rate": 3.5472877545585516e-05, "loss": 0.2039, "num_input_tokens_seen": 3127064, "step": 16400 }, { "epoch": 8.526507276507276, "grad_norm": 0.11129415780305862, "learning_rate": 3.546258018942813e-05, "loss": 0.0453, "num_input_tokens_seen": 3128024, "step": 16405 }, { "epoch": 8.529106029106028, "grad_norm": 0.32982558012008667, "learning_rate": 3.545228068086868e-05, "loss": 0.1151, "num_input_tokens_seen": 3128920, "step": 16410 }, { "epoch": 8.531704781704782, "grad_norm": 0.7333389520645142, "learning_rate": 3.5441979022026014e-05, "loss": 0.1917, "num_input_tokens_seen": 3129848, "step": 16415 }, { "epoch": 8.534303534303534, "grad_norm": 0.6637657880783081, "learning_rate": 3.5431675215019436e-05, "loss": 0.0935, "num_input_tokens_seen": 3130776, "step": 16420 }, { "epoch": 8.536902286902286, "grad_norm": 0.12636610865592957, "learning_rate": 3.542136926196867e-05, "loss": 0.0375, "num_input_tokens_seen": 3131640, "step": 16425 }, { "epoch": 8.53950103950104, "grad_norm": 0.7634742856025696, "learning_rate": 3.541106116499391e-05, "loss": 0.1875, "num_input_tokens_seen": 3132568, "step": 16430 }, { "epoch": 8.542099792099792, "grad_norm": 0.8834359049797058, "learning_rate": 3.540075092621577e-05, "loss": 0.1273, "num_input_tokens_seen": 3133528, "step": 16435 }, { "epoch": 8.544698544698544, "grad_norm": 0.7062100172042847, "learning_rate": 3.5390438547755326e-05, "loss": 0.1188, "num_input_tokens_seen": 3134456, "step": 16440 }, { "epoch": 8.547297297297296, "grad_norm": 0.16462674736976624, "learning_rate": 3.538012403173406e-05, "loss": 0.2371, "num_input_tokens_seen": 3135416, "step": 16445 }, { "epoch": 8.54989604989605, "grad_norm": 0.36583879590034485, "learning_rate": 3.5369807380273924e-05, "loss": 0.1303, "num_input_tokens_seen": 3136344, "step": 16450 }, { "epoch": 8.552494802494802, "grad_norm": 0.5696263313293457, "learning_rate": 3.535948859549729e-05, "loss": 0.0877, "num_input_tokens_seen": 3137336, "step": 16455 }, { "epoch": 8.555093555093555, "grad_norm": 1.1392253637313843, "learning_rate": 3.5349167679527e-05, "loss": 0.1275, "num_input_tokens_seen": 3138360, "step": 16460 }, { "epoch": 8.557692307692308, "grad_norm": 0.14818792045116425, "learning_rate": 3.533884463448629e-05, "loss": 0.0414, "num_input_tokens_seen": 3139320, "step": 16465 }, { "epoch": 8.56029106029106, "grad_norm": 0.1185111552476883, "learning_rate": 3.5328519462498865e-05, "loss": 0.184, "num_input_tokens_seen": 3140216, "step": 16470 }, { "epoch": 8.562889812889813, "grad_norm": 0.10640189051628113, "learning_rate": 3.5318192165688855e-05, "loss": 0.1356, "num_input_tokens_seen": 3141208, "step": 16475 }, { "epoch": 8.565488565488565, "grad_norm": 1.1479194164276123, "learning_rate": 3.5307862746180845e-05, "loss": 0.2016, "num_input_tokens_seen": 3142296, "step": 16480 }, { "epoch": 8.568087318087318, "grad_norm": 0.7199756503105164, "learning_rate": 3.529753120609982e-05, "loss": 0.1424, "num_input_tokens_seen": 3143288, "step": 16485 }, { "epoch": 8.57068607068607, "grad_norm": 0.22925089299678802, "learning_rate": 3.528719754757124e-05, "loss": 0.0644, "num_input_tokens_seen": 3144312, "step": 16490 }, { "epoch": 8.573284823284823, "grad_norm": 1.3600889444351196, "learning_rate": 3.527686177272098e-05, "loss": 0.2357, "num_input_tokens_seen": 3145240, "step": 16495 }, { "epoch": 8.575883575883577, "grad_norm": 0.4021078646183014, "learning_rate": 3.526652388367536e-05, "loss": 0.1488, "num_input_tokens_seen": 3146168, "step": 16500 }, { "epoch": 8.578482328482329, "grad_norm": 0.2937275171279907, "learning_rate": 3.5256183882561124e-05, "loss": 0.0806, "num_input_tokens_seen": 3147160, "step": 16505 }, { "epoch": 8.58108108108108, "grad_norm": 0.3342320919036865, "learning_rate": 3.524584177150546e-05, "loss": 0.1633, "num_input_tokens_seen": 3148120, "step": 16510 }, { "epoch": 8.583679833679835, "grad_norm": 0.15210993587970734, "learning_rate": 3.5235497552635984e-05, "loss": 0.1655, "num_input_tokens_seen": 3149080, "step": 16515 }, { "epoch": 8.586278586278587, "grad_norm": 0.19205918908119202, "learning_rate": 3.522515122808075e-05, "loss": 0.1274, "num_input_tokens_seen": 3149976, "step": 16520 }, { "epoch": 8.588877338877339, "grad_norm": 1.4405075311660767, "learning_rate": 3.521480279996824e-05, "loss": 0.1364, "num_input_tokens_seen": 3150936, "step": 16525 }, { "epoch": 8.59147609147609, "grad_norm": 0.30265939235687256, "learning_rate": 3.5204452270427385e-05, "loss": 0.1522, "num_input_tokens_seen": 3151928, "step": 16530 }, { "epoch": 8.594074844074845, "grad_norm": 0.5760906338691711, "learning_rate": 3.519409964158751e-05, "loss": 0.1411, "num_input_tokens_seen": 3152888, "step": 16535 }, { "epoch": 8.596673596673597, "grad_norm": 0.9548824429512024, "learning_rate": 3.518374491557844e-05, "loss": 0.1242, "num_input_tokens_seen": 3153784, "step": 16540 }, { "epoch": 8.599272349272349, "grad_norm": 0.26047131419181824, "learning_rate": 3.517338809453034e-05, "loss": 0.1237, "num_input_tokens_seen": 3154744, "step": 16545 }, { "epoch": 8.601871101871101, "grad_norm": 0.36091601848602295, "learning_rate": 3.516302918057389e-05, "loss": 0.1317, "num_input_tokens_seen": 3155736, "step": 16550 }, { "epoch": 8.604469854469855, "grad_norm": 0.0564054399728775, "learning_rate": 3.515266817584014e-05, "loss": 0.0513, "num_input_tokens_seen": 3156664, "step": 16555 }, { "epoch": 8.607068607068607, "grad_norm": 0.10376319289207458, "learning_rate": 3.514230508246061e-05, "loss": 0.0957, "num_input_tokens_seen": 3157528, "step": 16560 }, { "epoch": 8.609667359667359, "grad_norm": 0.10874297469854355, "learning_rate": 3.513193990256725e-05, "loss": 0.0737, "num_input_tokens_seen": 3158584, "step": 16565 }, { "epoch": 8.612266112266113, "grad_norm": 0.24949626624584198, "learning_rate": 3.512157263829239e-05, "loss": 0.0941, "num_input_tokens_seen": 3159576, "step": 16570 }, { "epoch": 8.614864864864865, "grad_norm": 0.861060619354248, "learning_rate": 3.511120329176885e-05, "loss": 0.1479, "num_input_tokens_seen": 3160504, "step": 16575 }, { "epoch": 8.617463617463617, "grad_norm": 1.0351645946502686, "learning_rate": 3.510083186512984e-05, "loss": 0.1611, "num_input_tokens_seen": 3161400, "step": 16580 }, { "epoch": 8.62006237006237, "grad_norm": 0.15260310471057892, "learning_rate": 3.5090458360509024e-05, "loss": 0.1461, "num_input_tokens_seen": 3162360, "step": 16585 }, { "epoch": 8.622661122661123, "grad_norm": 1.5958157777786255, "learning_rate": 3.5080082780040455e-05, "loss": 0.1814, "num_input_tokens_seen": 3163256, "step": 16590 }, { "epoch": 8.625259875259875, "grad_norm": 0.7144147753715515, "learning_rate": 3.5069705125858655e-05, "loss": 0.1594, "num_input_tokens_seen": 3164184, "step": 16595 }, { "epoch": 8.627858627858627, "grad_norm": 1.0262820720672607, "learning_rate": 3.505932540009854e-05, "loss": 0.262, "num_input_tokens_seen": 3165112, "step": 16600 }, { "epoch": 8.630457380457381, "grad_norm": 0.34495988488197327, "learning_rate": 3.504894360489549e-05, "loss": 0.1536, "num_input_tokens_seen": 3166072, "step": 16605 }, { "epoch": 8.633056133056133, "grad_norm": 0.40294545888900757, "learning_rate": 3.503855974238527e-05, "loss": 0.0554, "num_input_tokens_seen": 3167032, "step": 16610 }, { "epoch": 8.635654885654885, "grad_norm": 0.15593498945236206, "learning_rate": 3.502817381470409e-05, "loss": 0.0914, "num_input_tokens_seen": 3167992, "step": 16615 }, { "epoch": 8.638253638253639, "grad_norm": 0.45703476667404175, "learning_rate": 3.501778582398859e-05, "loss": 0.1864, "num_input_tokens_seen": 3168920, "step": 16620 }, { "epoch": 8.640852390852391, "grad_norm": 1.0458168983459473, "learning_rate": 3.5007395772375815e-05, "loss": 0.1854, "num_input_tokens_seen": 3169912, "step": 16625 }, { "epoch": 8.643451143451143, "grad_norm": 0.7024351358413696, "learning_rate": 3.499700366200326e-05, "loss": 0.0787, "num_input_tokens_seen": 3170840, "step": 16630 }, { "epoch": 8.646049896049895, "grad_norm": 0.8979767560958862, "learning_rate": 3.498660949500882e-05, "loss": 0.168, "num_input_tokens_seen": 3171768, "step": 16635 }, { "epoch": 8.64864864864865, "grad_norm": 0.4326724410057068, "learning_rate": 3.497621327353083e-05, "loss": 0.1, "num_input_tokens_seen": 3172696, "step": 16640 }, { "epoch": 8.651247401247401, "grad_norm": 0.4435308575630188, "learning_rate": 3.496581499970804e-05, "loss": 0.2051, "num_input_tokens_seen": 3173592, "step": 16645 }, { "epoch": 8.653846153846153, "grad_norm": 0.23093612492084503, "learning_rate": 3.495541467567961e-05, "loss": 0.0979, "num_input_tokens_seen": 3174520, "step": 16650 }, { "epoch": 8.656444906444907, "grad_norm": 0.6159910559654236, "learning_rate": 3.494501230358515e-05, "loss": 0.068, "num_input_tokens_seen": 3175480, "step": 16655 }, { "epoch": 8.65904365904366, "grad_norm": 0.840406596660614, "learning_rate": 3.493460788556467e-05, "loss": 0.1122, "num_input_tokens_seen": 3176440, "step": 16660 }, { "epoch": 8.661642411642411, "grad_norm": 0.900252103805542, "learning_rate": 3.4924201423758606e-05, "loss": 0.2243, "num_input_tokens_seen": 3177432, "step": 16665 }, { "epoch": 8.664241164241163, "grad_norm": 0.33161821961402893, "learning_rate": 3.491379292030781e-05, "loss": 0.0709, "num_input_tokens_seen": 3178392, "step": 16670 }, { "epoch": 8.666839916839917, "grad_norm": 1.2557734251022339, "learning_rate": 3.490338237735358e-05, "loss": 0.1722, "num_input_tokens_seen": 3179416, "step": 16675 }, { "epoch": 8.66943866943867, "grad_norm": 1.4183558225631714, "learning_rate": 3.489296979703758e-05, "loss": 0.2002, "num_input_tokens_seen": 3180344, "step": 16680 }, { "epoch": 8.672037422037421, "grad_norm": 0.10350113362073898, "learning_rate": 3.488255518150195e-05, "loss": 0.1351, "num_input_tokens_seen": 3181208, "step": 16685 }, { "epoch": 8.674636174636175, "grad_norm": 1.1191993951797485, "learning_rate": 3.487213853288922e-05, "loss": 0.0917, "num_input_tokens_seen": 3182200, "step": 16690 }, { "epoch": 8.677234927234927, "grad_norm": 0.17143285274505615, "learning_rate": 3.486171985334234e-05, "loss": 0.0345, "num_input_tokens_seen": 3183160, "step": 16695 }, { "epoch": 8.67983367983368, "grad_norm": 1.3577136993408203, "learning_rate": 3.485129914500468e-05, "loss": 0.2059, "num_input_tokens_seen": 3184152, "step": 16700 }, { "epoch": 8.682432432432432, "grad_norm": 0.29311811923980713, "learning_rate": 3.4840876410020035e-05, "loss": 0.1408, "num_input_tokens_seen": 3185144, "step": 16705 }, { "epoch": 8.685031185031185, "grad_norm": 0.5091372728347778, "learning_rate": 3.4830451650532604e-05, "loss": 0.0994, "num_input_tokens_seen": 3186104, "step": 16710 }, { "epoch": 8.687629937629938, "grad_norm": 0.9135188460350037, "learning_rate": 3.4820024868687e-05, "loss": 0.1501, "num_input_tokens_seen": 3187064, "step": 16715 }, { "epoch": 8.69022869022869, "grad_norm": 0.8741414546966553, "learning_rate": 3.480959606662828e-05, "loss": 0.0697, "num_input_tokens_seen": 3187960, "step": 16720 }, { "epoch": 8.692827442827443, "grad_norm": 0.5529941916465759, "learning_rate": 3.479916524650188e-05, "loss": 0.1112, "num_input_tokens_seen": 3188984, "step": 16725 }, { "epoch": 8.695426195426196, "grad_norm": 0.12094960361719131, "learning_rate": 3.4788732410453676e-05, "loss": 0.0749, "num_input_tokens_seen": 3189912, "step": 16730 }, { "epoch": 8.698024948024948, "grad_norm": 0.9691567420959473, "learning_rate": 3.4778297560629956e-05, "loss": 0.0984, "num_input_tokens_seen": 3190840, "step": 16735 }, { "epoch": 8.700623700623701, "grad_norm": 0.36055004596710205, "learning_rate": 3.4767860699177404e-05, "loss": 0.0467, "num_input_tokens_seen": 3191768, "step": 16740 }, { "epoch": 8.703222453222454, "grad_norm": 1.7319319248199463, "learning_rate": 3.475742182824314e-05, "loss": 0.123, "num_input_tokens_seen": 3192632, "step": 16745 }, { "epoch": 8.705821205821206, "grad_norm": 0.5373045802116394, "learning_rate": 3.474698094997469e-05, "loss": 0.1507, "num_input_tokens_seen": 3193656, "step": 16750 }, { "epoch": 8.708419958419958, "grad_norm": 0.056584328413009644, "learning_rate": 3.473653806651998e-05, "loss": 0.1131, "num_input_tokens_seen": 3194648, "step": 16755 }, { "epoch": 8.711018711018712, "grad_norm": 0.3418041467666626, "learning_rate": 3.472609318002737e-05, "loss": 0.0481, "num_input_tokens_seen": 3195608, "step": 16760 }, { "epoch": 8.713617463617464, "grad_norm": 0.7543900012969971, "learning_rate": 3.471564629264563e-05, "loss": 0.1657, "num_input_tokens_seen": 3196536, "step": 16765 }, { "epoch": 8.716216216216216, "grad_norm": 0.16890807449817657, "learning_rate": 3.470519740652393e-05, "loss": 0.1468, "num_input_tokens_seen": 3197432, "step": 16770 }, { "epoch": 8.71881496881497, "grad_norm": 0.3502025306224823, "learning_rate": 3.4694746523811826e-05, "loss": 0.3063, "num_input_tokens_seen": 3198424, "step": 16775 }, { "epoch": 8.721413721413722, "grad_norm": 0.1680581122636795, "learning_rate": 3.468429364665935e-05, "loss": 0.094, "num_input_tokens_seen": 3199384, "step": 16780 }, { "epoch": 8.724012474012474, "grad_norm": 0.07202986627817154, "learning_rate": 3.467383877721689e-05, "loss": 0.1101, "num_input_tokens_seen": 3200344, "step": 16785 }, { "epoch": 8.726611226611226, "grad_norm": 0.3714849650859833, "learning_rate": 3.466338191763528e-05, "loss": 0.0804, "num_input_tokens_seen": 3201304, "step": 16790 }, { "epoch": 8.72920997920998, "grad_norm": 0.337440550327301, "learning_rate": 3.465292307006571e-05, "loss": 0.0884, "num_input_tokens_seen": 3202264, "step": 16795 }, { "epoch": 8.731808731808732, "grad_norm": 0.9392799735069275, "learning_rate": 3.464246223665985e-05, "loss": 0.2116, "num_input_tokens_seen": 3203192, "step": 16800 }, { "epoch": 8.734407484407484, "grad_norm": 1.5264010429382324, "learning_rate": 3.463199941956973e-05, "loss": 0.1206, "num_input_tokens_seen": 3204120, "step": 16805 }, { "epoch": 8.737006237006238, "grad_norm": 0.8440771102905273, "learning_rate": 3.4621534620947785e-05, "loss": 0.0747, "num_input_tokens_seen": 3205080, "step": 16810 }, { "epoch": 8.73960498960499, "grad_norm": 0.6708743572235107, "learning_rate": 3.4611067842946895e-05, "loss": 0.1738, "num_input_tokens_seen": 3205944, "step": 16815 }, { "epoch": 8.742203742203742, "grad_norm": 1.2347012758255005, "learning_rate": 3.460059908772032e-05, "loss": 0.1404, "num_input_tokens_seen": 3206840, "step": 16820 }, { "epoch": 8.744802494802494, "grad_norm": 1.0702455043792725, "learning_rate": 3.4590128357421737e-05, "loss": 0.1575, "num_input_tokens_seen": 3207768, "step": 16825 }, { "epoch": 8.747401247401248, "grad_norm": 0.5479921698570251, "learning_rate": 3.4579655654205206e-05, "loss": 0.214, "num_input_tokens_seen": 3208760, "step": 16830 }, { "epoch": 8.75, "grad_norm": 0.7713848352432251, "learning_rate": 3.456918098022523e-05, "loss": 0.1897, "num_input_tokens_seen": 3209720, "step": 16835 }, { "epoch": 8.752598752598752, "grad_norm": 0.1297743022441864, "learning_rate": 3.4558704337636693e-05, "loss": 0.2847, "num_input_tokens_seen": 3210616, "step": 16840 }, { "epoch": 8.755197505197506, "grad_norm": 0.7083470225334167, "learning_rate": 3.4548225728594896e-05, "loss": 0.1451, "num_input_tokens_seen": 3211576, "step": 16845 }, { "epoch": 8.757796257796258, "grad_norm": 0.3482667803764343, "learning_rate": 3.4537745155255525e-05, "loss": 0.1879, "num_input_tokens_seen": 3212568, "step": 16850 }, { "epoch": 8.76039501039501, "grad_norm": 0.6618008613586426, "learning_rate": 3.452726261977469e-05, "loss": 0.2407, "num_input_tokens_seen": 3213560, "step": 16855 }, { "epoch": 8.762993762993762, "grad_norm": 1.1647676229476929, "learning_rate": 3.45167781243089e-05, "loss": 0.1239, "num_input_tokens_seen": 3214456, "step": 16860 }, { "epoch": 8.765592515592516, "grad_norm": 0.21164323389530182, "learning_rate": 3.450629167101507e-05, "loss": 0.1026, "num_input_tokens_seen": 3215384, "step": 16865 }, { "epoch": 8.768191268191268, "grad_norm": 0.897373616695404, "learning_rate": 3.449580326205051e-05, "loss": 0.0737, "num_input_tokens_seen": 3216280, "step": 16870 }, { "epoch": 8.77079002079002, "grad_norm": 0.29517680406570435, "learning_rate": 3.448531289957293e-05, "loss": 0.07, "num_input_tokens_seen": 3217208, "step": 16875 }, { "epoch": 8.773388773388774, "grad_norm": 1.43192458152771, "learning_rate": 3.447482058574045e-05, "loss": 0.1339, "num_input_tokens_seen": 3218168, "step": 16880 }, { "epoch": 8.775987525987526, "grad_norm": 0.5906874537467957, "learning_rate": 3.446432632271159e-05, "loss": 0.05, "num_input_tokens_seen": 3219160, "step": 16885 }, { "epoch": 8.778586278586278, "grad_norm": 0.7710105776786804, "learning_rate": 3.445383011264528e-05, "loss": 0.1972, "num_input_tokens_seen": 3220312, "step": 16890 }, { "epoch": 8.78118503118503, "grad_norm": 0.21354489028453827, "learning_rate": 3.444333195770082e-05, "loss": 0.1469, "num_input_tokens_seen": 3221272, "step": 16895 }, { "epoch": 8.783783783783784, "grad_norm": 0.1482516974210739, "learning_rate": 3.443283186003795e-05, "loss": 0.0358, "num_input_tokens_seen": 3222168, "step": 16900 }, { "epoch": 8.786382536382536, "grad_norm": 0.2918083965778351, "learning_rate": 3.442232982181679e-05, "loss": 0.1932, "num_input_tokens_seen": 3223160, "step": 16905 }, { "epoch": 8.788981288981288, "grad_norm": 0.8287760615348816, "learning_rate": 3.441182584519783e-05, "loss": 0.2541, "num_input_tokens_seen": 3224120, "step": 16910 }, { "epoch": 8.791580041580042, "grad_norm": 0.13509654998779297, "learning_rate": 3.4401319932342024e-05, "loss": 0.0864, "num_input_tokens_seen": 3225048, "step": 16915 }, { "epoch": 8.794178794178794, "grad_norm": 1.7802553176879883, "learning_rate": 3.439081208541067e-05, "loss": 0.2467, "num_input_tokens_seen": 3225976, "step": 16920 }, { "epoch": 8.796777546777546, "grad_norm": 0.15240830183029175, "learning_rate": 3.4380302306565495e-05, "loss": 0.2011, "num_input_tokens_seen": 3226872, "step": 16925 }, { "epoch": 8.799376299376299, "grad_norm": 1.6790709495544434, "learning_rate": 3.436979059796859e-05, "loss": 0.1714, "num_input_tokens_seen": 3227800, "step": 16930 }, { "epoch": 8.801975051975052, "grad_norm": 0.7290529012680054, "learning_rate": 3.435927696178249e-05, "loss": 0.1526, "num_input_tokens_seen": 3228760, "step": 16935 }, { "epoch": 8.804573804573804, "grad_norm": 0.14318741858005524, "learning_rate": 3.434876140017008e-05, "loss": 0.1302, "num_input_tokens_seen": 3229720, "step": 16940 }, { "epoch": 8.807172557172557, "grad_norm": 0.6828780174255371, "learning_rate": 3.4338243915294675e-05, "loss": 0.1399, "num_input_tokens_seen": 3230648, "step": 16945 }, { "epoch": 8.80977130977131, "grad_norm": 0.4901238679885864, "learning_rate": 3.432772450931996e-05, "loss": 0.1448, "num_input_tokens_seen": 3231576, "step": 16950 }, { "epoch": 8.812370062370062, "grad_norm": 0.46133995056152344, "learning_rate": 3.431720318441004e-05, "loss": 0.0963, "num_input_tokens_seen": 3232568, "step": 16955 }, { "epoch": 8.814968814968815, "grad_norm": 0.1609533131122589, "learning_rate": 3.43066799427294e-05, "loss": 0.2907, "num_input_tokens_seen": 3233464, "step": 16960 }, { "epoch": 8.817567567567568, "grad_norm": 0.9730597138404846, "learning_rate": 3.4296154786442905e-05, "loss": 0.1772, "num_input_tokens_seen": 3234456, "step": 16965 }, { "epoch": 8.82016632016632, "grad_norm": 0.10696469247341156, "learning_rate": 3.4285627717715854e-05, "loss": 0.1118, "num_input_tokens_seen": 3235480, "step": 16970 }, { "epoch": 8.822765072765073, "grad_norm": 0.12959694862365723, "learning_rate": 3.42750987387139e-05, "loss": 0.1281, "num_input_tokens_seen": 3236440, "step": 16975 }, { "epoch": 8.825363825363825, "grad_norm": 0.9162874221801758, "learning_rate": 3.426456785160311e-05, "loss": 0.098, "num_input_tokens_seen": 3237400, "step": 16980 }, { "epoch": 8.827962577962579, "grad_norm": 1.7448924779891968, "learning_rate": 3.4254035058549936e-05, "loss": 0.2247, "num_input_tokens_seen": 3238296, "step": 16985 }, { "epoch": 8.83056133056133, "grad_norm": 1.0256881713867188, "learning_rate": 3.424350036172124e-05, "loss": 0.1339, "num_input_tokens_seen": 3239256, "step": 16990 }, { "epoch": 8.833160083160083, "grad_norm": 0.39320823550224304, "learning_rate": 3.423296376328423e-05, "loss": 0.1181, "num_input_tokens_seen": 3240184, "step": 16995 }, { "epoch": 8.835758835758837, "grad_norm": 0.36434096097946167, "learning_rate": 3.422242526540657e-05, "loss": 0.1148, "num_input_tokens_seen": 3241112, "step": 17000 }, { "epoch": 8.838357588357589, "grad_norm": 0.13296541571617126, "learning_rate": 3.421188487025625e-05, "loss": 0.087, "num_input_tokens_seen": 3242040, "step": 17005 }, { "epoch": 8.84095634095634, "grad_norm": 1.2743256092071533, "learning_rate": 3.420134258000169e-05, "loss": 0.2248, "num_input_tokens_seen": 3243032, "step": 17010 }, { "epoch": 8.843555093555093, "grad_norm": 1.3116416931152344, "learning_rate": 3.4190798396811706e-05, "loss": 0.2527, "num_input_tokens_seen": 3244056, "step": 17015 }, { "epoch": 8.846153846153847, "grad_norm": 0.4931081235408783, "learning_rate": 3.418025232285547e-05, "loss": 0.1841, "num_input_tokens_seen": 3244984, "step": 17020 }, { "epoch": 8.848752598752599, "grad_norm": 0.8398978114128113, "learning_rate": 3.416970436030257e-05, "loss": 0.0754, "num_input_tokens_seen": 3245880, "step": 17025 }, { "epoch": 8.85135135135135, "grad_norm": 0.46598052978515625, "learning_rate": 3.415915451132297e-05, "loss": 0.1512, "num_input_tokens_seen": 3246840, "step": 17030 }, { "epoch": 8.853950103950105, "grad_norm": 0.9314407110214233, "learning_rate": 3.414860277808701e-05, "loss": 0.0895, "num_input_tokens_seen": 3247768, "step": 17035 }, { "epoch": 8.856548856548857, "grad_norm": 1.1559873819351196, "learning_rate": 3.413804916276546e-05, "loss": 0.1536, "num_input_tokens_seen": 3248728, "step": 17040 }, { "epoch": 8.859147609147609, "grad_norm": 0.8219788670539856, "learning_rate": 3.4127493667529436e-05, "loss": 0.2784, "num_input_tokens_seen": 3249656, "step": 17045 }, { "epoch": 8.861746361746361, "grad_norm": 0.6758630275726318, "learning_rate": 3.411693629455046e-05, "loss": 0.2243, "num_input_tokens_seen": 3250552, "step": 17050 }, { "epoch": 8.864345114345115, "grad_norm": 0.66642165184021, "learning_rate": 3.410637704600042e-05, "loss": 0.111, "num_input_tokens_seen": 3251544, "step": 17055 }, { "epoch": 8.866943866943867, "grad_norm": 0.37617990374565125, "learning_rate": 3.4095815924051635e-05, "loss": 0.2012, "num_input_tokens_seen": 3252472, "step": 17060 }, { "epoch": 8.869542619542619, "grad_norm": 0.18039099872112274, "learning_rate": 3.4085252930876747e-05, "loss": 0.0768, "num_input_tokens_seen": 3253368, "step": 17065 }, { "epoch": 8.872141372141373, "grad_norm": 0.7594520449638367, "learning_rate": 3.407468806864883e-05, "loss": 0.1777, "num_input_tokens_seen": 3254296, "step": 17070 }, { "epoch": 8.874740124740125, "grad_norm": 1.0666894912719727, "learning_rate": 3.406412133954133e-05, "loss": 0.1675, "num_input_tokens_seen": 3255288, "step": 17075 }, { "epoch": 8.877338877338877, "grad_norm": 0.8702863454818726, "learning_rate": 3.405355274572808e-05, "loss": 0.1499, "num_input_tokens_seen": 3256248, "step": 17080 }, { "epoch": 8.87993762993763, "grad_norm": 0.1049475148320198, "learning_rate": 3.404298228938328e-05, "loss": 0.2074, "num_input_tokens_seen": 3257176, "step": 17085 }, { "epoch": 8.882536382536383, "grad_norm": 1.5377939939498901, "learning_rate": 3.4032409972681524e-05, "loss": 0.1265, "num_input_tokens_seen": 3258232, "step": 17090 }, { "epoch": 8.885135135135135, "grad_norm": 0.8521652817726135, "learning_rate": 3.4021835797797804e-05, "loss": 0.1115, "num_input_tokens_seen": 3259128, "step": 17095 }, { "epoch": 8.887733887733887, "grad_norm": 0.26002851128578186, "learning_rate": 3.4011259766907464e-05, "loss": 0.1696, "num_input_tokens_seen": 3260088, "step": 17100 }, { "epoch": 8.890332640332641, "grad_norm": 1.1069709062576294, "learning_rate": 3.4000681882186254e-05, "loss": 0.1789, "num_input_tokens_seen": 3261080, "step": 17105 }, { "epoch": 8.892931392931393, "grad_norm": 0.7209876775741577, "learning_rate": 3.399010214581029e-05, "loss": 0.0954, "num_input_tokens_seen": 3261944, "step": 17110 }, { "epoch": 8.895530145530145, "grad_norm": 1.2655364274978638, "learning_rate": 3.397952055995608e-05, "loss": 0.1123, "num_input_tokens_seen": 3262936, "step": 17115 }, { "epoch": 8.898128898128899, "grad_norm": 0.9749229550361633, "learning_rate": 3.3968937126800514e-05, "loss": 0.0959, "num_input_tokens_seen": 3263928, "step": 17120 }, { "epoch": 8.900727650727651, "grad_norm": 0.5152829885482788, "learning_rate": 3.3958351848520855e-05, "loss": 0.1639, "num_input_tokens_seen": 3264888, "step": 17125 }, { "epoch": 8.903326403326403, "grad_norm": 0.888173520565033, "learning_rate": 3.394776472729474e-05, "loss": 0.0871, "num_input_tokens_seen": 3265848, "step": 17130 }, { "epoch": 8.905925155925155, "grad_norm": 1.0222834348678589, "learning_rate": 3.393717576530019e-05, "loss": 0.1874, "num_input_tokens_seen": 3266840, "step": 17135 }, { "epoch": 8.90852390852391, "grad_norm": 0.5664945840835571, "learning_rate": 3.392658496471562e-05, "loss": 0.1094, "num_input_tokens_seen": 3267832, "step": 17140 }, { "epoch": 8.911122661122661, "grad_norm": 0.775693953037262, "learning_rate": 3.39159923277198e-05, "loss": 0.1242, "num_input_tokens_seen": 3268728, "step": 17145 }, { "epoch": 8.913721413721413, "grad_norm": 1.1659356355667114, "learning_rate": 3.3905397856491894e-05, "loss": 0.1966, "num_input_tokens_seen": 3269720, "step": 17150 }, { "epoch": 8.916320166320165, "grad_norm": 0.2962723672389984, "learning_rate": 3.389480155321143e-05, "loss": 0.2207, "num_input_tokens_seen": 3270616, "step": 17155 }, { "epoch": 8.91891891891892, "grad_norm": 0.16962359845638275, "learning_rate": 3.388420342005833e-05, "loss": 0.1339, "num_input_tokens_seen": 3271544, "step": 17160 }, { "epoch": 8.921517671517671, "grad_norm": 0.836139976978302, "learning_rate": 3.387360345921286e-05, "loss": 0.1031, "num_input_tokens_seen": 3272472, "step": 17165 }, { "epoch": 8.924116424116423, "grad_norm": 0.6169506311416626, "learning_rate": 3.386300167285571e-05, "loss": 0.0821, "num_input_tokens_seen": 3273496, "step": 17170 }, { "epoch": 8.926715176715177, "grad_norm": 0.4421449601650238, "learning_rate": 3.385239806316792e-05, "loss": 0.2392, "num_input_tokens_seen": 3274488, "step": 17175 }, { "epoch": 8.92931392931393, "grad_norm": 0.9750627279281616, "learning_rate": 3.384179263233088e-05, "loss": 0.1466, "num_input_tokens_seen": 3275384, "step": 17180 }, { "epoch": 8.931912681912682, "grad_norm": 0.1352241039276123, "learning_rate": 3.38311853825264e-05, "loss": 0.0983, "num_input_tokens_seen": 3276376, "step": 17185 }, { "epoch": 8.934511434511435, "grad_norm": 1.0002771615982056, "learning_rate": 3.382057631593665e-05, "loss": 0.1347, "num_input_tokens_seen": 3277368, "step": 17190 }, { "epoch": 8.937110187110187, "grad_norm": 0.1410733312368393, "learning_rate": 3.3809965434744154e-05, "loss": 0.1212, "num_input_tokens_seen": 3278328, "step": 17195 }, { "epoch": 8.93970893970894, "grad_norm": 0.2966572940349579, "learning_rate": 3.3799352741131815e-05, "loss": 0.0518, "num_input_tokens_seen": 3279320, "step": 17200 }, { "epoch": 8.942307692307692, "grad_norm": 0.16726292669773102, "learning_rate": 3.378873823728294e-05, "loss": 0.1051, "num_input_tokens_seen": 3280216, "step": 17205 }, { "epoch": 8.944906444906445, "grad_norm": 0.7351220846176147, "learning_rate": 3.3778121925381176e-05, "loss": 0.1546, "num_input_tokens_seen": 3281176, "step": 17210 }, { "epoch": 8.947505197505198, "grad_norm": 0.40864866971969604, "learning_rate": 3.376750380761054e-05, "loss": 0.1243, "num_input_tokens_seen": 3282168, "step": 17215 }, { "epoch": 8.95010395010395, "grad_norm": 0.5013140439987183, "learning_rate": 3.375688388615545e-05, "loss": 0.1558, "num_input_tokens_seen": 3283128, "step": 17220 }, { "epoch": 8.952702702702704, "grad_norm": 0.30814939737319946, "learning_rate": 3.374626216320066e-05, "loss": 0.2617, "num_input_tokens_seen": 3284088, "step": 17225 }, { "epoch": 8.955301455301456, "grad_norm": 0.600069522857666, "learning_rate": 3.373563864093133e-05, "loss": 0.1021, "num_input_tokens_seen": 3285016, "step": 17230 }, { "epoch": 8.957900207900208, "grad_norm": 0.841774046421051, "learning_rate": 3.372501332153296e-05, "loss": 0.1255, "num_input_tokens_seen": 3285976, "step": 17235 }, { "epoch": 8.96049896049896, "grad_norm": 0.13253509998321533, "learning_rate": 3.371438620719142e-05, "loss": 0.0919, "num_input_tokens_seen": 3286936, "step": 17240 }, { "epoch": 8.963097713097714, "grad_norm": 0.12714900076389313, "learning_rate": 3.370375730009299e-05, "loss": 0.273, "num_input_tokens_seen": 3287928, "step": 17245 }, { "epoch": 8.965696465696466, "grad_norm": 0.6151113510131836, "learning_rate": 3.369312660242426e-05, "loss": 0.1069, "num_input_tokens_seen": 3288888, "step": 17250 }, { "epoch": 8.968295218295218, "grad_norm": 0.11934313178062439, "learning_rate": 3.368249411637224e-05, "loss": 0.1177, "num_input_tokens_seen": 3289848, "step": 17255 }, { "epoch": 8.970893970893972, "grad_norm": 0.7067509293556213, "learning_rate": 3.3671859844124277e-05, "loss": 0.1624, "num_input_tokens_seen": 3290840, "step": 17260 }, { "epoch": 8.973492723492724, "grad_norm": 0.7689558863639832, "learning_rate": 3.3661223787868094e-05, "loss": 0.1053, "num_input_tokens_seen": 3291800, "step": 17265 }, { "epoch": 8.976091476091476, "grad_norm": 0.5722559094429016, "learning_rate": 3.365058594979178e-05, "loss": 0.2142, "num_input_tokens_seen": 3292824, "step": 17270 }, { "epoch": 8.978690228690228, "grad_norm": 0.6691731214523315, "learning_rate": 3.363994633208379e-05, "loss": 0.1093, "num_input_tokens_seen": 3293720, "step": 17275 }, { "epoch": 8.981288981288982, "grad_norm": 0.5634105801582336, "learning_rate": 3.362930493693295e-05, "loss": 0.1455, "num_input_tokens_seen": 3294552, "step": 17280 }, { "epoch": 8.983887733887734, "grad_norm": 0.2860109210014343, "learning_rate": 3.3618661766528446e-05, "loss": 0.1313, "num_input_tokens_seen": 3295544, "step": 17285 }, { "epoch": 8.986486486486486, "grad_norm": 1.2232955694198608, "learning_rate": 3.360801682305983e-05, "loss": 0.0827, "num_input_tokens_seen": 3296504, "step": 17290 }, { "epoch": 8.98908523908524, "grad_norm": 0.7153921723365784, "learning_rate": 3.359737010871704e-05, "loss": 0.0893, "num_input_tokens_seen": 3297464, "step": 17295 }, { "epoch": 8.991683991683992, "grad_norm": 0.862789511680603, "learning_rate": 3.3586721625690335e-05, "loss": 0.251, "num_input_tokens_seen": 3298424, "step": 17300 }, { "epoch": 8.994282744282744, "grad_norm": 1.0422855615615845, "learning_rate": 3.3576071376170374e-05, "loss": 0.1729, "num_input_tokens_seen": 3299448, "step": 17305 }, { "epoch": 8.996881496881496, "grad_norm": 1.9831641912460327, "learning_rate": 3.356541936234816e-05, "loss": 0.1388, "num_input_tokens_seen": 3300376, "step": 17310 }, { "epoch": 8.99948024948025, "grad_norm": 0.22005999088287354, "learning_rate": 3.355476558641507e-05, "loss": 0.1965, "num_input_tokens_seen": 3301304, "step": 17315 }, { "epoch": 9.0, "eval_loss": 0.1477271467447281, "eval_runtime": 8.0207, "eval_samples_per_second": 106.724, "eval_steps_per_second": 26.681, "num_input_tokens_seen": 3301448, "step": 17316 }, { "epoch": 9.002079002079002, "grad_norm": 0.7779337763786316, "learning_rate": 3.3544110050562836e-05, "loss": 0.1837, "num_input_tokens_seen": 3302152, "step": 17320 }, { "epoch": 9.004677754677754, "grad_norm": 0.5982151031494141, "learning_rate": 3.353345275698356e-05, "loss": 0.0611, "num_input_tokens_seen": 3303080, "step": 17325 }, { "epoch": 9.007276507276508, "grad_norm": 0.7530145645141602, "learning_rate": 3.35227937078697e-05, "loss": 0.1421, "num_input_tokens_seen": 3304008, "step": 17330 }, { "epoch": 9.00987525987526, "grad_norm": 1.2133606672286987, "learning_rate": 3.3512132905414085e-05, "loss": 0.2379, "num_input_tokens_seen": 3304936, "step": 17335 }, { "epoch": 9.012474012474012, "grad_norm": 0.36894315481185913, "learning_rate": 3.350147035180987e-05, "loss": 0.0966, "num_input_tokens_seen": 3305928, "step": 17340 }, { "epoch": 9.015072765072764, "grad_norm": 0.1381673663854599, "learning_rate": 3.349080604925063e-05, "loss": 0.0403, "num_input_tokens_seen": 3306888, "step": 17345 }, { "epoch": 9.017671517671518, "grad_norm": 0.9501680731773376, "learning_rate": 3.348013999993024e-05, "loss": 0.2043, "num_input_tokens_seen": 3307816, "step": 17350 }, { "epoch": 9.02027027027027, "grad_norm": 0.09378378838300705, "learning_rate": 3.346947220604298e-05, "loss": 0.1675, "num_input_tokens_seen": 3308744, "step": 17355 }, { "epoch": 9.022869022869022, "grad_norm": 1.5559372901916504, "learning_rate": 3.3458802669783454e-05, "loss": 0.1452, "num_input_tokens_seen": 3309704, "step": 17360 }, { "epoch": 9.025467775467776, "grad_norm": 0.8407475352287292, "learning_rate": 3.3448131393346645e-05, "loss": 0.2522, "num_input_tokens_seen": 3310632, "step": 17365 }, { "epoch": 9.028066528066528, "grad_norm": 0.5047258734703064, "learning_rate": 3.343745837892789e-05, "loss": 0.173, "num_input_tokens_seen": 3311528, "step": 17370 }, { "epoch": 9.03066528066528, "grad_norm": 0.5228941440582275, "learning_rate": 3.342678362872288e-05, "loss": 0.1721, "num_input_tokens_seen": 3312456, "step": 17375 }, { "epoch": 9.033264033264032, "grad_norm": 0.1947401762008667, "learning_rate": 3.3416107144927675e-05, "loss": 0.1302, "num_input_tokens_seen": 3313448, "step": 17380 }, { "epoch": 9.035862785862786, "grad_norm": 0.6975774168968201, "learning_rate": 3.340542892973867e-05, "loss": 0.1699, "num_input_tokens_seen": 3314376, "step": 17385 }, { "epoch": 9.038461538461538, "grad_norm": 1.2080726623535156, "learning_rate": 3.339474898535263e-05, "loss": 0.117, "num_input_tokens_seen": 3315336, "step": 17390 }, { "epoch": 9.04106029106029, "grad_norm": 1.0736334323883057, "learning_rate": 3.338406731396668e-05, "loss": 0.104, "num_input_tokens_seen": 3316328, "step": 17395 }, { "epoch": 9.043659043659044, "grad_norm": 0.7703151106834412, "learning_rate": 3.33733839177783e-05, "loss": 0.1521, "num_input_tokens_seen": 3317288, "step": 17400 }, { "epoch": 9.046257796257796, "grad_norm": 0.7240911722183228, "learning_rate": 3.33626987989853e-05, "loss": 0.0798, "num_input_tokens_seen": 3318248, "step": 17405 }, { "epoch": 9.048856548856548, "grad_norm": 2.2901854515075684, "learning_rate": 3.335201195978588e-05, "loss": 0.2255, "num_input_tokens_seen": 3319176, "step": 17410 }, { "epoch": 9.051455301455302, "grad_norm": 0.1758880913257599, "learning_rate": 3.3341323402378565e-05, "loss": 0.0512, "num_input_tokens_seen": 3320136, "step": 17415 }, { "epoch": 9.054054054054054, "grad_norm": 0.9119316339492798, "learning_rate": 3.333063312896225e-05, "loss": 0.1359, "num_input_tokens_seen": 3321064, "step": 17420 }, { "epoch": 9.056652806652806, "grad_norm": 0.9921808838844299, "learning_rate": 3.3319941141736196e-05, "loss": 0.1611, "num_input_tokens_seen": 3322056, "step": 17425 }, { "epoch": 9.059251559251559, "grad_norm": 0.5649247169494629, "learning_rate": 3.3309247442899974e-05, "loss": 0.1017, "num_input_tokens_seen": 3323080, "step": 17430 }, { "epoch": 9.061850311850312, "grad_norm": 0.3487137258052826, "learning_rate": 3.329855203465355e-05, "loss": 0.1389, "num_input_tokens_seen": 3324008, "step": 17435 }, { "epoch": 9.064449064449065, "grad_norm": 0.6795022487640381, "learning_rate": 3.328785491919721e-05, "loss": 0.1213, "num_input_tokens_seen": 3324936, "step": 17440 }, { "epoch": 9.067047817047817, "grad_norm": 0.956584095954895, "learning_rate": 3.3277156098731624e-05, "loss": 0.1412, "num_input_tokens_seen": 3325896, "step": 17445 }, { "epoch": 9.06964656964657, "grad_norm": 0.8438979387283325, "learning_rate": 3.326645557545778e-05, "loss": 0.0974, "num_input_tokens_seen": 3326856, "step": 17450 }, { "epoch": 9.072245322245323, "grad_norm": 1.1324809789657593, "learning_rate": 3.325575335157703e-05, "loss": 0.1209, "num_input_tokens_seen": 3327784, "step": 17455 }, { "epoch": 9.074844074844075, "grad_norm": 1.2735787630081177, "learning_rate": 3.324504942929109e-05, "loss": 0.1451, "num_input_tokens_seen": 3328744, "step": 17460 }, { "epoch": 9.077442827442827, "grad_norm": 0.7782925367355347, "learning_rate": 3.323434381080199e-05, "loss": 0.1502, "num_input_tokens_seen": 3329704, "step": 17465 }, { "epoch": 9.08004158004158, "grad_norm": 0.6436479687690735, "learning_rate": 3.3223636498312156e-05, "loss": 0.1167, "num_input_tokens_seen": 3330696, "step": 17470 }, { "epoch": 9.082640332640333, "grad_norm": 0.10772758722305298, "learning_rate": 3.321292749402432e-05, "loss": 0.1719, "num_input_tokens_seen": 3331720, "step": 17475 }, { "epoch": 9.085239085239085, "grad_norm": 0.4157848656177521, "learning_rate": 3.320221680014159e-05, "loss": 0.1537, "num_input_tokens_seen": 3332616, "step": 17480 }, { "epoch": 9.087837837837839, "grad_norm": 0.6804692149162292, "learning_rate": 3.3191504418867394e-05, "loss": 0.1487, "num_input_tokens_seen": 3333576, "step": 17485 }, { "epoch": 9.09043659043659, "grad_norm": 0.34795287251472473, "learning_rate": 3.318079035240554e-05, "loss": 0.2369, "num_input_tokens_seen": 3334536, "step": 17490 }, { "epoch": 9.093035343035343, "grad_norm": 0.6774426102638245, "learning_rate": 3.317007460296018e-05, "loss": 0.1188, "num_input_tokens_seen": 3335464, "step": 17495 }, { "epoch": 9.095634095634095, "grad_norm": 0.2415425330400467, "learning_rate": 3.315935717273576e-05, "loss": 0.0854, "num_input_tokens_seen": 3336456, "step": 17500 }, { "epoch": 9.098232848232849, "grad_norm": 0.4321959912776947, "learning_rate": 3.3148638063937144e-05, "loss": 0.1291, "num_input_tokens_seen": 3337480, "step": 17505 }, { "epoch": 9.1008316008316, "grad_norm": 0.5466830730438232, "learning_rate": 3.3137917278769486e-05, "loss": 0.0933, "num_input_tokens_seen": 3338408, "step": 17510 }, { "epoch": 9.103430353430353, "grad_norm": 0.9018737077713013, "learning_rate": 3.312719481943832e-05, "loss": 0.1352, "num_input_tokens_seen": 3339400, "step": 17515 }, { "epoch": 9.106029106029107, "grad_norm": 0.9920639395713806, "learning_rate": 3.311647068814952e-05, "loss": 0.1225, "num_input_tokens_seen": 3340360, "step": 17520 }, { "epoch": 9.108627858627859, "grad_norm": 0.25517842173576355, "learning_rate": 3.310574488710928e-05, "loss": 0.2097, "num_input_tokens_seen": 3341320, "step": 17525 }, { "epoch": 9.111226611226611, "grad_norm": 0.27561068534851074, "learning_rate": 3.309501741852415e-05, "loss": 0.0592, "num_input_tokens_seen": 3342312, "step": 17530 }, { "epoch": 9.113825363825363, "grad_norm": 0.12237069755792618, "learning_rate": 3.308428828460104e-05, "loss": 0.0537, "num_input_tokens_seen": 3343176, "step": 17535 }, { "epoch": 9.116424116424117, "grad_norm": 0.6387020945549011, "learning_rate": 3.3073557487547174e-05, "loss": 0.119, "num_input_tokens_seen": 3344136, "step": 17540 }, { "epoch": 9.119022869022869, "grad_norm": 1.8911802768707275, "learning_rate": 3.3062825029570143e-05, "loss": 0.1442, "num_input_tokens_seen": 3345064, "step": 17545 }, { "epoch": 9.121621621621621, "grad_norm": 0.18017442524433136, "learning_rate": 3.305209091287787e-05, "loss": 0.1681, "num_input_tokens_seen": 3345960, "step": 17550 }, { "epoch": 9.124220374220375, "grad_norm": 0.3660745918750763, "learning_rate": 3.3041355139678615e-05, "loss": 0.1448, "num_input_tokens_seen": 3346952, "step": 17555 }, { "epoch": 9.126819126819127, "grad_norm": 0.546504557132721, "learning_rate": 3.303061771218098e-05, "loss": 0.1069, "num_input_tokens_seen": 3347912, "step": 17560 }, { "epoch": 9.129417879417879, "grad_norm": 0.1384969800710678, "learning_rate": 3.301987863259392e-05, "loss": 0.2291, "num_input_tokens_seen": 3348872, "step": 17565 }, { "epoch": 9.132016632016631, "grad_norm": 0.25516337156295776, "learning_rate": 3.30091379031267e-05, "loss": 0.1494, "num_input_tokens_seen": 3349928, "step": 17570 }, { "epoch": 9.134615384615385, "grad_norm": 0.6480681300163269, "learning_rate": 3.299839552598896e-05, "loss": 0.0963, "num_input_tokens_seen": 3350984, "step": 17575 }, { "epoch": 9.137214137214137, "grad_norm": 1.5334111452102661, "learning_rate": 3.298765150339066e-05, "loss": 0.159, "num_input_tokens_seen": 3351944, "step": 17580 }, { "epoch": 9.13981288981289, "grad_norm": 0.414947509765625, "learning_rate": 3.29769058375421e-05, "loss": 0.1463, "num_input_tokens_seen": 3352872, "step": 17585 }, { "epoch": 9.142411642411643, "grad_norm": 0.196042999625206, "learning_rate": 3.296615853065391e-05, "loss": 0.2627, "num_input_tokens_seen": 3353768, "step": 17590 }, { "epoch": 9.145010395010395, "grad_norm": 0.12184017151594162, "learning_rate": 3.295540958493709e-05, "loss": 0.0705, "num_input_tokens_seen": 3354760, "step": 17595 }, { "epoch": 9.147609147609147, "grad_norm": 0.5246138572692871, "learning_rate": 3.294465900260294e-05, "loss": 0.1045, "num_input_tokens_seen": 3355720, "step": 17600 }, { "epoch": 9.1502079002079, "grad_norm": 0.22939343750476837, "learning_rate": 3.293390678586311e-05, "loss": 0.094, "num_input_tokens_seen": 3356648, "step": 17605 }, { "epoch": 9.152806652806653, "grad_norm": 0.37524649500846863, "learning_rate": 3.292315293692958e-05, "loss": 0.1121, "num_input_tokens_seen": 3357608, "step": 17610 }, { "epoch": 9.155405405405405, "grad_norm": 0.7668479084968567, "learning_rate": 3.29123974580147e-05, "loss": 0.1996, "num_input_tokens_seen": 3358568, "step": 17615 }, { "epoch": 9.158004158004157, "grad_norm": 0.7191494107246399, "learning_rate": 3.2901640351331106e-05, "loss": 0.1175, "num_input_tokens_seen": 3359496, "step": 17620 }, { "epoch": 9.160602910602911, "grad_norm": 0.3071810305118561, "learning_rate": 3.289088161909178e-05, "loss": 0.0961, "num_input_tokens_seen": 3360488, "step": 17625 }, { "epoch": 9.163201663201663, "grad_norm": 0.49665823578834534, "learning_rate": 3.288012126351008e-05, "loss": 0.2271, "num_input_tokens_seen": 3361480, "step": 17630 }, { "epoch": 9.165800415800415, "grad_norm": 0.2570871114730835, "learning_rate": 3.286935928679965e-05, "loss": 0.1676, "num_input_tokens_seen": 3362472, "step": 17635 }, { "epoch": 9.16839916839917, "grad_norm": 0.3219703137874603, "learning_rate": 3.285859569117449e-05, "loss": 0.081, "num_input_tokens_seen": 3363432, "step": 17640 }, { "epoch": 9.170997920997921, "grad_norm": 1.7423663139343262, "learning_rate": 3.2847830478848924e-05, "loss": 0.1268, "num_input_tokens_seen": 3364392, "step": 17645 }, { "epoch": 9.173596673596673, "grad_norm": 0.33858147263526917, "learning_rate": 3.283706365203762e-05, "loss": 0.1613, "num_input_tokens_seen": 3365352, "step": 17650 }, { "epoch": 9.176195426195425, "grad_norm": 0.6322495341300964, "learning_rate": 3.282629521295556e-05, "loss": 0.0615, "num_input_tokens_seen": 3366344, "step": 17655 }, { "epoch": 9.17879417879418, "grad_norm": 0.3447180688381195, "learning_rate": 3.2815525163818076e-05, "loss": 0.0475, "num_input_tokens_seen": 3367272, "step": 17660 }, { "epoch": 9.181392931392931, "grad_norm": 0.7725666165351868, "learning_rate": 3.2804753506840825e-05, "loss": 0.2053, "num_input_tokens_seen": 3368232, "step": 17665 }, { "epoch": 9.183991683991684, "grad_norm": 0.4011289179325104, "learning_rate": 3.279398024423978e-05, "loss": 0.0669, "num_input_tokens_seen": 3369192, "step": 17670 }, { "epoch": 9.186590436590437, "grad_norm": 0.793490469455719, "learning_rate": 3.278320537823128e-05, "loss": 0.1573, "num_input_tokens_seen": 3370184, "step": 17675 }, { "epoch": 9.18918918918919, "grad_norm": 0.08895941823720932, "learning_rate": 3.277242891103195e-05, "loss": 0.1671, "num_input_tokens_seen": 3371112, "step": 17680 }, { "epoch": 9.191787941787942, "grad_norm": 0.33932968974113464, "learning_rate": 3.2761650844858795e-05, "loss": 0.1244, "num_input_tokens_seen": 3372168, "step": 17685 }, { "epoch": 9.194386694386694, "grad_norm": 1.098922610282898, "learning_rate": 3.275087118192908e-05, "loss": 0.0873, "num_input_tokens_seen": 3373032, "step": 17690 }, { "epoch": 9.196985446985448, "grad_norm": 0.17577888071537018, "learning_rate": 3.274008992446047e-05, "loss": 0.1096, "num_input_tokens_seen": 3374024, "step": 17695 }, { "epoch": 9.1995841995842, "grad_norm": 1.106675386428833, "learning_rate": 3.272930707467092e-05, "loss": 0.1416, "num_input_tokens_seen": 3375048, "step": 17700 }, { "epoch": 9.202182952182952, "grad_norm": 1.6069000959396362, "learning_rate": 3.271852263477871e-05, "loss": 0.1424, "num_input_tokens_seen": 3375976, "step": 17705 }, { "epoch": 9.204781704781706, "grad_norm": 0.6765317916870117, "learning_rate": 3.270773660700246e-05, "loss": 0.1175, "num_input_tokens_seen": 3376840, "step": 17710 }, { "epoch": 9.207380457380458, "grad_norm": 1.3236202001571655, "learning_rate": 3.2696948993561116e-05, "loss": 0.1851, "num_input_tokens_seen": 3377864, "step": 17715 }, { "epoch": 9.20997920997921, "grad_norm": 0.8483890891075134, "learning_rate": 3.268615979667395e-05, "loss": 0.18, "num_input_tokens_seen": 3378792, "step": 17720 }, { "epoch": 9.212577962577962, "grad_norm": 0.7717626094818115, "learning_rate": 3.267536901856055e-05, "loss": 0.1232, "num_input_tokens_seen": 3379784, "step": 17725 }, { "epoch": 9.215176715176716, "grad_norm": 0.4944022297859192, "learning_rate": 3.266457666144084e-05, "loss": 0.1283, "num_input_tokens_seen": 3380808, "step": 17730 }, { "epoch": 9.217775467775468, "grad_norm": 0.17363765835762024, "learning_rate": 3.265378272753506e-05, "loss": 0.0753, "num_input_tokens_seen": 3381736, "step": 17735 }, { "epoch": 9.22037422037422, "grad_norm": 0.8620370626449585, "learning_rate": 3.2642987219063784e-05, "loss": 0.1592, "num_input_tokens_seen": 3382696, "step": 17740 }, { "epoch": 9.222972972972974, "grad_norm": 1.0446020364761353, "learning_rate": 3.263219013824791e-05, "loss": 0.1255, "num_input_tokens_seen": 3383720, "step": 17745 }, { "epoch": 9.225571725571726, "grad_norm": 1.1531513929367065, "learning_rate": 3.262139148730865e-05, "loss": 0.1614, "num_input_tokens_seen": 3384712, "step": 17750 }, { "epoch": 9.228170478170478, "grad_norm": 0.5222643613815308, "learning_rate": 3.261059126846754e-05, "loss": 0.0984, "num_input_tokens_seen": 3385704, "step": 17755 }, { "epoch": 9.23076923076923, "grad_norm": 0.8479136228561401, "learning_rate": 3.259978948394644e-05, "loss": 0.1256, "num_input_tokens_seen": 3386632, "step": 17760 }, { "epoch": 9.233367983367984, "grad_norm": 0.17259745299816132, "learning_rate": 3.2588986135967556e-05, "loss": 0.2337, "num_input_tokens_seen": 3387624, "step": 17765 }, { "epoch": 9.235966735966736, "grad_norm": 0.36218759417533875, "learning_rate": 3.257818122675336e-05, "loss": 0.1051, "num_input_tokens_seen": 3388584, "step": 17770 }, { "epoch": 9.238565488565488, "grad_norm": 0.49219170212745667, "learning_rate": 3.256737475852672e-05, "loss": 0.0865, "num_input_tokens_seen": 3389608, "step": 17775 }, { "epoch": 9.241164241164242, "grad_norm": 0.5760589838027954, "learning_rate": 3.255656673351074e-05, "loss": 0.137, "num_input_tokens_seen": 3390568, "step": 17780 }, { "epoch": 9.243762993762994, "grad_norm": 0.627931535243988, "learning_rate": 3.2545757153928924e-05, "loss": 0.0944, "num_input_tokens_seen": 3391528, "step": 17785 }, { "epoch": 9.246361746361746, "grad_norm": 2.1864922046661377, "learning_rate": 3.253494602200504e-05, "loss": 0.2513, "num_input_tokens_seen": 3392488, "step": 17790 }, { "epoch": 9.248960498960498, "grad_norm": 0.18430639803409576, "learning_rate": 3.2524133339963205e-05, "loss": 0.072, "num_input_tokens_seen": 3393448, "step": 17795 }, { "epoch": 9.251559251559252, "grad_norm": 0.5232694149017334, "learning_rate": 3.251331911002784e-05, "loss": 0.1151, "num_input_tokens_seen": 3394280, "step": 17800 }, { "epoch": 9.254158004158004, "grad_norm": 0.30007150769233704, "learning_rate": 3.250250333442369e-05, "loss": 0.1235, "num_input_tokens_seen": 3395240, "step": 17805 }, { "epoch": 9.256756756756756, "grad_norm": 0.1265772432088852, "learning_rate": 3.2491686015375824e-05, "loss": 0.048, "num_input_tokens_seen": 3396168, "step": 17810 }, { "epoch": 9.25935550935551, "grad_norm": 1.2103136777877808, "learning_rate": 3.248086715510962e-05, "loss": 0.1326, "num_input_tokens_seen": 3397160, "step": 17815 }, { "epoch": 9.261954261954262, "grad_norm": 0.7765852808952332, "learning_rate": 3.247004675585077e-05, "loss": 0.2475, "num_input_tokens_seen": 3398120, "step": 17820 }, { "epoch": 9.264553014553014, "grad_norm": 0.41196003556251526, "learning_rate": 3.245922481982528e-05, "loss": 0.1126, "num_input_tokens_seen": 3399080, "step": 17825 }, { "epoch": 9.267151767151766, "grad_norm": 0.6442327499389648, "learning_rate": 3.244840134925951e-05, "loss": 0.1118, "num_input_tokens_seen": 3399976, "step": 17830 }, { "epoch": 9.26975051975052, "grad_norm": 0.8682807087898254, "learning_rate": 3.243757634638008e-05, "loss": 0.1313, "num_input_tokens_seen": 3400904, "step": 17835 }, { "epoch": 9.272349272349272, "grad_norm": 0.31034478545188904, "learning_rate": 3.242674981341396e-05, "loss": 0.1395, "num_input_tokens_seen": 3401800, "step": 17840 }, { "epoch": 9.274948024948024, "grad_norm": 1.3951495885849, "learning_rate": 3.241592175258842e-05, "loss": 0.2498, "num_input_tokens_seen": 3402728, "step": 17845 }, { "epoch": 9.277546777546778, "grad_norm": 0.8686874508857727, "learning_rate": 3.2405092166131056e-05, "loss": 0.1374, "num_input_tokens_seen": 3403720, "step": 17850 }, { "epoch": 9.28014553014553, "grad_norm": 1.2827051877975464, "learning_rate": 3.239426105626977e-05, "loss": 0.1649, "num_input_tokens_seen": 3404744, "step": 17855 }, { "epoch": 9.282744282744282, "grad_norm": 0.48737114667892456, "learning_rate": 3.2383428425232775e-05, "loss": 0.0998, "num_input_tokens_seen": 3405704, "step": 17860 }, { "epoch": 9.285343035343036, "grad_norm": 1.6223268508911133, "learning_rate": 3.2372594275248614e-05, "loss": 0.3014, "num_input_tokens_seen": 3406632, "step": 17865 }, { "epoch": 9.287941787941788, "grad_norm": 1.0991151332855225, "learning_rate": 3.2361758608546116e-05, "loss": 0.2003, "num_input_tokens_seen": 3407560, "step": 17870 }, { "epoch": 9.29054054054054, "grad_norm": 0.19942209124565125, "learning_rate": 3.2350921427354454e-05, "loss": 0.2376, "num_input_tokens_seen": 3408456, "step": 17875 }, { "epoch": 9.293139293139292, "grad_norm": 0.22439269721508026, "learning_rate": 3.2340082733903074e-05, "loss": 0.182, "num_input_tokens_seen": 3409480, "step": 17880 }, { "epoch": 9.295738045738046, "grad_norm": 0.3495723307132721, "learning_rate": 3.232924253042175e-05, "loss": 0.1575, "num_input_tokens_seen": 3410376, "step": 17885 }, { "epoch": 9.298336798336798, "grad_norm": 0.3664705157279968, "learning_rate": 3.23184008191406e-05, "loss": 0.1425, "num_input_tokens_seen": 3411336, "step": 17890 }, { "epoch": 9.30093555093555, "grad_norm": 0.23807628452777863, "learning_rate": 3.230755760228999e-05, "loss": 0.0681, "num_input_tokens_seen": 3412296, "step": 17895 }, { "epoch": 9.303534303534304, "grad_norm": 1.7646645307540894, "learning_rate": 3.229671288210065e-05, "loss": 0.0898, "num_input_tokens_seen": 3413224, "step": 17900 }, { "epoch": 9.306133056133056, "grad_norm": 0.339089035987854, "learning_rate": 3.228586666080359e-05, "loss": 0.1016, "num_input_tokens_seen": 3414184, "step": 17905 }, { "epoch": 9.308731808731808, "grad_norm": 0.6491122841835022, "learning_rate": 3.2275018940630136e-05, "loss": 0.1508, "num_input_tokens_seen": 3415112, "step": 17910 }, { "epoch": 9.31133056133056, "grad_norm": 0.7964248657226562, "learning_rate": 3.2264169723811924e-05, "loss": 0.1268, "num_input_tokens_seen": 3416008, "step": 17915 }, { "epoch": 9.313929313929314, "grad_norm": 0.9588674902915955, "learning_rate": 3.225331901258089e-05, "loss": 0.1807, "num_input_tokens_seen": 3416936, "step": 17920 }, { "epoch": 9.316528066528067, "grad_norm": 0.7545025944709778, "learning_rate": 3.224246680916929e-05, "loss": 0.0868, "num_input_tokens_seen": 3417960, "step": 17925 }, { "epoch": 9.319126819126819, "grad_norm": 0.8830640912055969, "learning_rate": 3.223161311580968e-05, "loss": 0.162, "num_input_tokens_seen": 3418984, "step": 17930 }, { "epoch": 9.321725571725572, "grad_norm": 1.2653526067733765, "learning_rate": 3.2220757934734926e-05, "loss": 0.1103, "num_input_tokens_seen": 3420008, "step": 17935 }, { "epoch": 9.324324324324325, "grad_norm": 0.23942317068576813, "learning_rate": 3.220990126817819e-05, "loss": 0.1268, "num_input_tokens_seen": 3421000, "step": 17940 }, { "epoch": 9.326923076923077, "grad_norm": 0.7801417708396912, "learning_rate": 3.219904311837296e-05, "loss": 0.1243, "num_input_tokens_seen": 3421960, "step": 17945 }, { "epoch": 9.329521829521829, "grad_norm": 1.1243635416030884, "learning_rate": 3.2188183487552996e-05, "loss": 0.2079, "num_input_tokens_seen": 3422920, "step": 17950 }, { "epoch": 9.332120582120583, "grad_norm": 0.5223261713981628, "learning_rate": 3.2177322377952404e-05, "loss": 0.1228, "num_input_tokens_seen": 3423816, "step": 17955 }, { "epoch": 9.334719334719335, "grad_norm": 0.9224863648414612, "learning_rate": 3.216645979180555e-05, "loss": 0.183, "num_input_tokens_seen": 3424744, "step": 17960 }, { "epoch": 9.337318087318087, "grad_norm": 1.205736756324768, "learning_rate": 3.215559573134715e-05, "loss": 0.1265, "num_input_tokens_seen": 3425672, "step": 17965 }, { "epoch": 9.33991683991684, "grad_norm": 1.513069748878479, "learning_rate": 3.214473019881219e-05, "loss": 0.3092, "num_input_tokens_seen": 3426664, "step": 17970 }, { "epoch": 9.342515592515593, "grad_norm": 0.5667880773544312, "learning_rate": 3.2133863196435966e-05, "loss": 0.0684, "num_input_tokens_seen": 3427560, "step": 17975 }, { "epoch": 9.345114345114345, "grad_norm": 0.8853113055229187, "learning_rate": 3.2122994726454095e-05, "loss": 0.0614, "num_input_tokens_seen": 3428456, "step": 17980 }, { "epoch": 9.347713097713097, "grad_norm": 0.6619320511817932, "learning_rate": 3.211212479110245e-05, "loss": 0.0813, "num_input_tokens_seen": 3429448, "step": 17985 }, { "epoch": 9.35031185031185, "grad_norm": 1.3096309900283813, "learning_rate": 3.210125339261727e-05, "loss": 0.1392, "num_input_tokens_seen": 3430312, "step": 17990 }, { "epoch": 9.352910602910603, "grad_norm": 0.8585634231567383, "learning_rate": 3.209038053323503e-05, "loss": 0.1904, "num_input_tokens_seen": 3431304, "step": 17995 }, { "epoch": 9.355509355509355, "grad_norm": 1.7967878580093384, "learning_rate": 3.207950621519256e-05, "loss": 0.1588, "num_input_tokens_seen": 3432360, "step": 18000 }, { "epoch": 9.358108108108109, "grad_norm": 0.24711009860038757, "learning_rate": 3.206863044072695e-05, "loss": 0.1022, "num_input_tokens_seen": 3433288, "step": 18005 }, { "epoch": 9.36070686070686, "grad_norm": 0.29006943106651306, "learning_rate": 3.2057753212075625e-05, "loss": 0.2788, "num_input_tokens_seen": 3434280, "step": 18010 }, { "epoch": 9.363305613305613, "grad_norm": 0.09830750524997711, "learning_rate": 3.204687453147627e-05, "loss": 0.0495, "num_input_tokens_seen": 3435176, "step": 18015 }, { "epoch": 9.365904365904367, "grad_norm": 0.18990422785282135, "learning_rate": 3.20359944011669e-05, "loss": 0.1726, "num_input_tokens_seen": 3436168, "step": 18020 }, { "epoch": 9.368503118503119, "grad_norm": 1.4233505725860596, "learning_rate": 3.202511282338581e-05, "loss": 0.1436, "num_input_tokens_seen": 3437128, "step": 18025 }, { "epoch": 9.371101871101871, "grad_norm": 0.34582096338272095, "learning_rate": 3.2014229800371605e-05, "loss": 0.1826, "num_input_tokens_seen": 3438120, "step": 18030 }, { "epoch": 9.373700623700623, "grad_norm": 1.290401577949524, "learning_rate": 3.200334533436318e-05, "loss": 0.1971, "num_input_tokens_seen": 3438984, "step": 18035 }, { "epoch": 9.376299376299377, "grad_norm": 0.7639732956886292, "learning_rate": 3.199245942759973e-05, "loss": 0.1623, "num_input_tokens_seen": 3439944, "step": 18040 }, { "epoch": 9.378898128898129, "grad_norm": 0.1432519108057022, "learning_rate": 3.1981572082320756e-05, "loss": 0.1572, "num_input_tokens_seen": 3440872, "step": 18045 }, { "epoch": 9.381496881496881, "grad_norm": 0.1262863427400589, "learning_rate": 3.1970683300766026e-05, "loss": 0.1174, "num_input_tokens_seen": 3441864, "step": 18050 }, { "epoch": 9.384095634095633, "grad_norm": 0.5180023908615112, "learning_rate": 3.195979308517563e-05, "loss": 0.1541, "num_input_tokens_seen": 3442792, "step": 18055 }, { "epoch": 9.386694386694387, "grad_norm": 0.8411284685134888, "learning_rate": 3.1948901437789945e-05, "loss": 0.1476, "num_input_tokens_seen": 3443784, "step": 18060 }, { "epoch": 9.38929313929314, "grad_norm": 0.44148147106170654, "learning_rate": 3.1938008360849636e-05, "loss": 0.1084, "num_input_tokens_seen": 3444808, "step": 18065 }, { "epoch": 9.391891891891891, "grad_norm": 0.7973974943161011, "learning_rate": 3.192711385659569e-05, "loss": 0.1165, "num_input_tokens_seen": 3445768, "step": 18070 }, { "epoch": 9.394490644490645, "grad_norm": 0.1764523833990097, "learning_rate": 3.1916217927269333e-05, "loss": 0.0726, "num_input_tokens_seen": 3446664, "step": 18075 }, { "epoch": 9.397089397089397, "grad_norm": 1.1473257541656494, "learning_rate": 3.190532057511215e-05, "loss": 0.1499, "num_input_tokens_seen": 3447560, "step": 18080 }, { "epoch": 9.39968814968815, "grad_norm": 0.711474597454071, "learning_rate": 3.189442180236597e-05, "loss": 0.1588, "num_input_tokens_seen": 3448488, "step": 18085 }, { "epoch": 9.402286902286903, "grad_norm": 0.4423305094242096, "learning_rate": 3.188352161127292e-05, "loss": 0.2143, "num_input_tokens_seen": 3449512, "step": 18090 }, { "epoch": 9.404885654885655, "grad_norm": 0.6081171631813049, "learning_rate": 3.187262000407544e-05, "loss": 0.1235, "num_input_tokens_seen": 3450504, "step": 18095 }, { "epoch": 9.407484407484407, "grad_norm": 0.6215254068374634, "learning_rate": 3.186171698301626e-05, "loss": 0.3068, "num_input_tokens_seen": 3451400, "step": 18100 }, { "epoch": 9.41008316008316, "grad_norm": 0.6655709147453308, "learning_rate": 3.185081255033838e-05, "loss": 0.1994, "num_input_tokens_seen": 3452456, "step": 18105 }, { "epoch": 9.412681912681913, "grad_norm": 0.920810878276825, "learning_rate": 3.1839906708285105e-05, "loss": 0.0704, "num_input_tokens_seen": 3453448, "step": 18110 }, { "epoch": 9.415280665280665, "grad_norm": 0.4167174696922302, "learning_rate": 3.182899945910003e-05, "loss": 0.1494, "num_input_tokens_seen": 3454504, "step": 18115 }, { "epoch": 9.417879417879417, "grad_norm": 0.7744794487953186, "learning_rate": 3.181809080502702e-05, "loss": 0.1001, "num_input_tokens_seen": 3455464, "step": 18120 }, { "epoch": 9.420478170478171, "grad_norm": 0.12459004670381546, "learning_rate": 3.180718074831026e-05, "loss": 0.0815, "num_input_tokens_seen": 3456392, "step": 18125 }, { "epoch": 9.423076923076923, "grad_norm": 0.6958101391792297, "learning_rate": 3.17962692911942e-05, "loss": 0.0885, "num_input_tokens_seen": 3457384, "step": 18130 }, { "epoch": 9.425675675675675, "grad_norm": 0.31054818630218506, "learning_rate": 3.178535643592361e-05, "loss": 0.1362, "num_input_tokens_seen": 3458408, "step": 18135 }, { "epoch": 9.428274428274428, "grad_norm": 1.065309762954712, "learning_rate": 3.177444218474348e-05, "loss": 0.3692, "num_input_tokens_seen": 3459400, "step": 18140 }, { "epoch": 9.430873180873181, "grad_norm": 1.0077540874481201, "learning_rate": 3.176352653989918e-05, "loss": 0.1708, "num_input_tokens_seen": 3460424, "step": 18145 }, { "epoch": 9.433471933471933, "grad_norm": 0.09870833903551102, "learning_rate": 3.175260950363629e-05, "loss": 0.1334, "num_input_tokens_seen": 3461320, "step": 18150 }, { "epoch": 9.436070686070686, "grad_norm": 0.8046303987503052, "learning_rate": 3.174169107820071e-05, "loss": 0.107, "num_input_tokens_seen": 3462216, "step": 18155 }, { "epoch": 9.43866943866944, "grad_norm": 0.39272361993789673, "learning_rate": 3.173077126583862e-05, "loss": 0.1257, "num_input_tokens_seen": 3463112, "step": 18160 }, { "epoch": 9.441268191268192, "grad_norm": 0.22595012187957764, "learning_rate": 3.171985006879649e-05, "loss": 0.181, "num_input_tokens_seen": 3464104, "step": 18165 }, { "epoch": 9.443866943866944, "grad_norm": 0.2823699414730072, "learning_rate": 3.170892748932106e-05, "loss": 0.165, "num_input_tokens_seen": 3465032, "step": 18170 }, { "epoch": 9.446465696465696, "grad_norm": 0.5109095573425293, "learning_rate": 3.1698003529659374e-05, "loss": 0.211, "num_input_tokens_seen": 3465992, "step": 18175 }, { "epoch": 9.44906444906445, "grad_norm": 0.8949924111366272, "learning_rate": 3.168707819205875e-05, "loss": 0.1096, "num_input_tokens_seen": 3466984, "step": 18180 }, { "epoch": 9.451663201663202, "grad_norm": 0.31565263867378235, "learning_rate": 3.167615147876679e-05, "loss": 0.0805, "num_input_tokens_seen": 3467880, "step": 18185 }, { "epoch": 9.454261954261954, "grad_norm": 1.0394939184188843, "learning_rate": 3.166522339203138e-05, "loss": 0.086, "num_input_tokens_seen": 3468808, "step": 18190 }, { "epoch": 9.456860706860708, "grad_norm": 0.23921747505664825, "learning_rate": 3.1654293934100684e-05, "loss": 0.0719, "num_input_tokens_seen": 3469736, "step": 18195 }, { "epoch": 9.45945945945946, "grad_norm": 0.5626555681228638, "learning_rate": 3.164336310722315e-05, "loss": 0.1144, "num_input_tokens_seen": 3470728, "step": 18200 }, { "epoch": 9.462058212058212, "grad_norm": 0.35199958086013794, "learning_rate": 3.163243091364752e-05, "loss": 0.088, "num_input_tokens_seen": 3471688, "step": 18205 }, { "epoch": 9.464656964656964, "grad_norm": 0.34392237663269043, "learning_rate": 3.16214973556228e-05, "loss": 0.0752, "num_input_tokens_seen": 3472648, "step": 18210 }, { "epoch": 9.467255717255718, "grad_norm": 0.2835848331451416, "learning_rate": 3.161056243539828e-05, "loss": 0.0478, "num_input_tokens_seen": 3473608, "step": 18215 }, { "epoch": 9.46985446985447, "grad_norm": 0.5137467384338379, "learning_rate": 3.159962615522354e-05, "loss": 0.1596, "num_input_tokens_seen": 3474568, "step": 18220 }, { "epoch": 9.472453222453222, "grad_norm": 1.1978113651275635, "learning_rate": 3.158868851734843e-05, "loss": 0.1172, "num_input_tokens_seen": 3475464, "step": 18225 }, { "epoch": 9.475051975051976, "grad_norm": 1.9151722192764282, "learning_rate": 3.157774952402309e-05, "loss": 0.2301, "num_input_tokens_seen": 3476456, "step": 18230 }, { "epoch": 9.477650727650728, "grad_norm": 0.2850181758403778, "learning_rate": 3.156680917749791e-05, "loss": 0.0779, "num_input_tokens_seen": 3477384, "step": 18235 }, { "epoch": 9.48024948024948, "grad_norm": 0.22608934342861176, "learning_rate": 3.155586748002362e-05, "loss": 0.1464, "num_input_tokens_seen": 3478280, "step": 18240 }, { "epoch": 9.482848232848234, "grad_norm": 0.46692967414855957, "learning_rate": 3.154492443385114e-05, "loss": 0.092, "num_input_tokens_seen": 3479208, "step": 18245 }, { "epoch": 9.485446985446986, "grad_norm": 0.6761794686317444, "learning_rate": 3.1533980041231746e-05, "loss": 0.1133, "num_input_tokens_seen": 3480136, "step": 18250 }, { "epoch": 9.488045738045738, "grad_norm": 0.11078780889511108, "learning_rate": 3.1523034304416956e-05, "loss": 0.126, "num_input_tokens_seen": 3481096, "step": 18255 }, { "epoch": 9.49064449064449, "grad_norm": 0.19523437321186066, "learning_rate": 3.151208722565856e-05, "loss": 0.1154, "num_input_tokens_seen": 3482056, "step": 18260 }, { "epoch": 9.493243243243244, "grad_norm": 0.23887352645397186, "learning_rate": 3.150113880720864e-05, "loss": 0.0671, "num_input_tokens_seen": 3483048, "step": 18265 }, { "epoch": 9.495841995841996, "grad_norm": 0.8686093688011169, "learning_rate": 3.149018905131955e-05, "loss": 0.1905, "num_input_tokens_seen": 3483976, "step": 18270 }, { "epoch": 9.498440748440748, "grad_norm": 0.12191438674926758, "learning_rate": 3.147923796024391e-05, "loss": 0.1196, "num_input_tokens_seen": 3484968, "step": 18275 }, { "epoch": 9.5010395010395, "grad_norm": 0.36425691843032837, "learning_rate": 3.146828553623462e-05, "loss": 0.086, "num_input_tokens_seen": 3485896, "step": 18280 }, { "epoch": 9.503638253638254, "grad_norm": 0.07379370927810669, "learning_rate": 3.145733178154486e-05, "loss": 0.0988, "num_input_tokens_seen": 3486792, "step": 18285 }, { "epoch": 9.506237006237006, "grad_norm": 0.47880685329437256, "learning_rate": 3.144637669842807e-05, "loss": 0.1803, "num_input_tokens_seen": 3487688, "step": 18290 }, { "epoch": 9.508835758835758, "grad_norm": 0.46903660893440247, "learning_rate": 3.143542028913797e-05, "loss": 0.1397, "num_input_tokens_seen": 3488680, "step": 18295 }, { "epoch": 9.511434511434512, "grad_norm": 0.22133220732212067, "learning_rate": 3.142446255592857e-05, "loss": 0.1728, "num_input_tokens_seen": 3489672, "step": 18300 }, { "epoch": 9.514033264033264, "grad_norm": 0.6568641066551208, "learning_rate": 3.141350350105413e-05, "loss": 0.1255, "num_input_tokens_seen": 3490664, "step": 18305 }, { "epoch": 9.516632016632016, "grad_norm": 0.9814249873161316, "learning_rate": 3.140254312676919e-05, "loss": 0.1355, "num_input_tokens_seen": 3491624, "step": 18310 }, { "epoch": 9.51923076923077, "grad_norm": 0.11251785606145859, "learning_rate": 3.139158143532854e-05, "loss": 0.1786, "num_input_tokens_seen": 3492616, "step": 18315 }, { "epoch": 9.521829521829522, "grad_norm": 0.24443283677101135, "learning_rate": 3.138061842898729e-05, "loss": 0.1207, "num_input_tokens_seen": 3493480, "step": 18320 }, { "epoch": 9.524428274428274, "grad_norm": 1.000644564628601, "learning_rate": 3.136965411000077e-05, "loss": 0.2174, "num_input_tokens_seen": 3494408, "step": 18325 }, { "epoch": 9.527027027027026, "grad_norm": 0.761117160320282, "learning_rate": 3.1358688480624616e-05, "loss": 0.2529, "num_input_tokens_seen": 3495400, "step": 18330 }, { "epoch": 9.52962577962578, "grad_norm": 0.1686885505914688, "learning_rate": 3.134772154311471e-05, "loss": 0.1554, "num_input_tokens_seen": 3496328, "step": 18335 }, { "epoch": 9.532224532224532, "grad_norm": 1.1565327644348145, "learning_rate": 3.133675329972722e-05, "loss": 0.1901, "num_input_tokens_seen": 3497288, "step": 18340 }, { "epoch": 9.534823284823284, "grad_norm": 0.6158298254013062, "learning_rate": 3.132578375271858e-05, "loss": 0.2326, "num_input_tokens_seen": 3498248, "step": 18345 }, { "epoch": 9.537422037422038, "grad_norm": 0.3551133871078491, "learning_rate": 3.1314812904345466e-05, "loss": 0.1498, "num_input_tokens_seen": 3499176, "step": 18350 }, { "epoch": 9.54002079002079, "grad_norm": 0.75063556432724, "learning_rate": 3.1303840756864866e-05, "loss": 0.2542, "num_input_tokens_seen": 3500168, "step": 18355 }, { "epoch": 9.542619542619542, "grad_norm": 0.745955228805542, "learning_rate": 3.129286731253399e-05, "loss": 0.0539, "num_input_tokens_seen": 3501064, "step": 18360 }, { "epoch": 9.545218295218294, "grad_norm": 0.04668742045760155, "learning_rate": 3.128189257361036e-05, "loss": 0.0986, "num_input_tokens_seen": 3501992, "step": 18365 }, { "epoch": 9.547817047817048, "grad_norm": 0.10354524105787277, "learning_rate": 3.127091654235172e-05, "loss": 0.0886, "num_input_tokens_seen": 3502920, "step": 18370 }, { "epoch": 9.5504158004158, "grad_norm": 0.5113164782524109, "learning_rate": 3.1259939221016126e-05, "loss": 0.051, "num_input_tokens_seen": 3503784, "step": 18375 }, { "epoch": 9.553014553014552, "grad_norm": 0.7094444632530212, "learning_rate": 3.124896061186185e-05, "loss": 0.2669, "num_input_tokens_seen": 3504744, "step": 18380 }, { "epoch": 9.555613305613306, "grad_norm": 0.7367657423019409, "learning_rate": 3.1237980717147465e-05, "loss": 0.1065, "num_input_tokens_seen": 3505704, "step": 18385 }, { "epoch": 9.558212058212058, "grad_norm": 0.922308623790741, "learning_rate": 3.12269995391318e-05, "loss": 0.0728, "num_input_tokens_seen": 3506664, "step": 18390 }, { "epoch": 9.56081081081081, "grad_norm": 1.5945526361465454, "learning_rate": 3.1216017080073935e-05, "loss": 0.1173, "num_input_tokens_seen": 3507656, "step": 18395 }, { "epoch": 9.563409563409563, "grad_norm": 0.6920477747917175, "learning_rate": 3.120503334223323e-05, "loss": 0.1818, "num_input_tokens_seen": 3508552, "step": 18400 }, { "epoch": 9.566008316008316, "grad_norm": 1.364896297454834, "learning_rate": 3.1194048327869294e-05, "loss": 0.1368, "num_input_tokens_seen": 3509448, "step": 18405 }, { "epoch": 9.568607068607069, "grad_norm": 0.5512923002243042, "learning_rate": 3.1183062039242026e-05, "loss": 0.0879, "num_input_tokens_seen": 3510440, "step": 18410 }, { "epoch": 9.57120582120582, "grad_norm": 0.6643773913383484, "learning_rate": 3.1172074478611543e-05, "loss": 0.1341, "num_input_tokens_seen": 3511368, "step": 18415 }, { "epoch": 9.573804573804575, "grad_norm": 0.5062008500099182, "learning_rate": 3.1161085648238266e-05, "loss": 0.11, "num_input_tokens_seen": 3512392, "step": 18420 }, { "epoch": 9.576403326403327, "grad_norm": 0.46099328994750977, "learning_rate": 3.115009555038285e-05, "loss": 0.0867, "num_input_tokens_seen": 3513320, "step": 18425 }, { "epoch": 9.579002079002079, "grad_norm": 0.5510773658752441, "learning_rate": 3.113910418730623e-05, "loss": 0.1609, "num_input_tokens_seen": 3514344, "step": 18430 }, { "epoch": 9.58160083160083, "grad_norm": 0.259360134601593, "learning_rate": 3.112811156126958e-05, "loss": 0.0856, "num_input_tokens_seen": 3515304, "step": 18435 }, { "epoch": 9.584199584199585, "grad_norm": 0.5232740044593811, "learning_rate": 3.111711767453435e-05, "loss": 0.216, "num_input_tokens_seen": 3516264, "step": 18440 }, { "epoch": 9.586798336798337, "grad_norm": 0.5502287745475769, "learning_rate": 3.1106122529362236e-05, "loss": 0.1152, "num_input_tokens_seen": 3517192, "step": 18445 }, { "epoch": 9.589397089397089, "grad_norm": 0.5757089853286743, "learning_rate": 3.109512612801521e-05, "loss": 0.1672, "num_input_tokens_seen": 3518184, "step": 18450 }, { "epoch": 9.591995841995843, "grad_norm": 1.7423651218414307, "learning_rate": 3.10841284727555e-05, "loss": 0.1337, "num_input_tokens_seen": 3519112, "step": 18455 }, { "epoch": 9.594594594594595, "grad_norm": 0.38173386454582214, "learning_rate": 3.107312956584557e-05, "loss": 0.1111, "num_input_tokens_seen": 3520072, "step": 18460 }, { "epoch": 9.597193347193347, "grad_norm": 0.6754013895988464, "learning_rate": 3.1062129409548177e-05, "loss": 0.0877, "num_input_tokens_seen": 3521000, "step": 18465 }, { "epoch": 9.5997920997921, "grad_norm": 0.9117701649665833, "learning_rate": 3.10511280061263e-05, "loss": 0.2157, "num_input_tokens_seen": 3521992, "step": 18470 }, { "epoch": 9.602390852390853, "grad_norm": 0.09632015973329544, "learning_rate": 3.104012535784319e-05, "loss": 0.0815, "num_input_tokens_seen": 3523048, "step": 18475 }, { "epoch": 9.604989604989605, "grad_norm": 1.2124818563461304, "learning_rate": 3.102912146696236e-05, "loss": 0.1267, "num_input_tokens_seen": 3524008, "step": 18480 }, { "epoch": 9.607588357588357, "grad_norm": 0.3305533826351166, "learning_rate": 3.1018116335747574e-05, "loss": 0.1614, "num_input_tokens_seen": 3524968, "step": 18485 }, { "epoch": 9.61018711018711, "grad_norm": 0.2776567339897156, "learning_rate": 3.100710996646284e-05, "loss": 0.1641, "num_input_tokens_seen": 3525800, "step": 18490 }, { "epoch": 9.612785862785863, "grad_norm": 0.8585129380226135, "learning_rate": 3.0996102361372436e-05, "loss": 0.1515, "num_input_tokens_seen": 3526792, "step": 18495 }, { "epoch": 9.615384615384615, "grad_norm": 0.19723299145698547, "learning_rate": 3.098509352274089e-05, "loss": 0.1082, "num_input_tokens_seen": 3527752, "step": 18500 }, { "epoch": 9.617983367983367, "grad_norm": 0.31049492955207825, "learning_rate": 3.0974083452832994e-05, "loss": 0.1213, "num_input_tokens_seen": 3528712, "step": 18505 }, { "epoch": 9.620582120582121, "grad_norm": 0.3539816737174988, "learning_rate": 3.096307215391376e-05, "loss": 0.0815, "num_input_tokens_seen": 3529576, "step": 18510 }, { "epoch": 9.623180873180873, "grad_norm": 1.0021699666976929, "learning_rate": 3.095205962824848e-05, "loss": 0.1201, "num_input_tokens_seen": 3530504, "step": 18515 }, { "epoch": 9.625779625779625, "grad_norm": 0.6780235171318054, "learning_rate": 3.0941045878102696e-05, "loss": 0.1557, "num_input_tokens_seen": 3531496, "step": 18520 }, { "epoch": 9.628378378378379, "grad_norm": 1.326111912727356, "learning_rate": 3.0930030905742216e-05, "loss": 0.1488, "num_input_tokens_seen": 3532488, "step": 18525 }, { "epoch": 9.630977130977131, "grad_norm": 0.47981759905815125, "learning_rate": 3.091901471343305e-05, "loss": 0.1085, "num_input_tokens_seen": 3533448, "step": 18530 }, { "epoch": 9.633575883575883, "grad_norm": 0.30775052309036255, "learning_rate": 3.090799730344152e-05, "loss": 0.1849, "num_input_tokens_seen": 3534344, "step": 18535 }, { "epoch": 9.636174636174637, "grad_norm": 1.7411302328109741, "learning_rate": 3.0896978678034146e-05, "loss": 0.1787, "num_input_tokens_seen": 3535304, "step": 18540 }, { "epoch": 9.638773388773389, "grad_norm": 1.4299463033676147, "learning_rate": 3.088595883947773e-05, "loss": 0.2041, "num_input_tokens_seen": 3536296, "step": 18545 }, { "epoch": 9.641372141372141, "grad_norm": 1.393294095993042, "learning_rate": 3.087493779003933e-05, "loss": 0.2063, "num_input_tokens_seen": 3537320, "step": 18550 }, { "epoch": 9.643970893970893, "grad_norm": 0.5792320370674133, "learning_rate": 3.086391553198623e-05, "loss": 0.1856, "num_input_tokens_seen": 3538248, "step": 18555 }, { "epoch": 9.646569646569647, "grad_norm": 0.17538361251354218, "learning_rate": 3.0852892067585954e-05, "loss": 0.0643, "num_input_tokens_seen": 3539144, "step": 18560 }, { "epoch": 9.6491683991684, "grad_norm": 0.15672381222248077, "learning_rate": 3.0841867399106316e-05, "loss": 0.1318, "num_input_tokens_seen": 3540136, "step": 18565 }, { "epoch": 9.651767151767151, "grad_norm": 0.6324601173400879, "learning_rate": 3.083084152881534e-05, "loss": 0.1415, "num_input_tokens_seen": 3541096, "step": 18570 }, { "epoch": 9.654365904365905, "grad_norm": 0.5631384253501892, "learning_rate": 3.081981445898131e-05, "loss": 0.0489, "num_input_tokens_seen": 3541992, "step": 18575 }, { "epoch": 9.656964656964657, "grad_norm": 0.5948622226715088, "learning_rate": 3.080878619187276e-05, "loss": 0.1254, "num_input_tokens_seen": 3542984, "step": 18580 }, { "epoch": 9.65956340956341, "grad_norm": 0.2517915368080139, "learning_rate": 3.079775672975846e-05, "loss": 0.1185, "num_input_tokens_seen": 3543880, "step": 18585 }, { "epoch": 9.662162162162161, "grad_norm": 1.0813871622085571, "learning_rate": 3.078672607490746e-05, "loss": 0.1168, "num_input_tokens_seen": 3544840, "step": 18590 }, { "epoch": 9.664760914760915, "grad_norm": 0.28856202960014343, "learning_rate": 3.077569422958899e-05, "loss": 0.1639, "num_input_tokens_seen": 3545736, "step": 18595 }, { "epoch": 9.667359667359667, "grad_norm": 0.36845529079437256, "learning_rate": 3.0764661196072596e-05, "loss": 0.2261, "num_input_tokens_seen": 3546600, "step": 18600 }, { "epoch": 9.66995841995842, "grad_norm": 1.0595078468322754, "learning_rate": 3.075362697662802e-05, "loss": 0.1907, "num_input_tokens_seen": 3547592, "step": 18605 }, { "epoch": 9.672557172557173, "grad_norm": 0.8138142824172974, "learning_rate": 3.074259157352526e-05, "loss": 0.1376, "num_input_tokens_seen": 3548520, "step": 18610 }, { "epoch": 9.675155925155925, "grad_norm": 0.7944565415382385, "learning_rate": 3.073155498903457e-05, "loss": 0.18, "num_input_tokens_seen": 3549416, "step": 18615 }, { "epoch": 9.677754677754677, "grad_norm": 0.30103081464767456, "learning_rate": 3.072051722542643e-05, "loss": 0.0759, "num_input_tokens_seen": 3550280, "step": 18620 }, { "epoch": 9.68035343035343, "grad_norm": 0.19236557185649872, "learning_rate": 3.0709478284971585e-05, "loss": 0.0843, "num_input_tokens_seen": 3551208, "step": 18625 }, { "epoch": 9.682952182952183, "grad_norm": 0.3652462959289551, "learning_rate": 3.0698438169940994e-05, "loss": 0.0606, "num_input_tokens_seen": 3552168, "step": 18630 }, { "epoch": 9.685550935550935, "grad_norm": 0.6559436321258545, "learning_rate": 3.068739688260589e-05, "loss": 0.1822, "num_input_tokens_seen": 3553128, "step": 18635 }, { "epoch": 9.688149688149688, "grad_norm": 0.581523060798645, "learning_rate": 3.067635442523771e-05, "loss": 0.0997, "num_input_tokens_seen": 3553992, "step": 18640 }, { "epoch": 9.690748440748441, "grad_norm": 0.6566278338432312, "learning_rate": 3.066531080010815e-05, "loss": 0.1651, "num_input_tokens_seen": 3554984, "step": 18645 }, { "epoch": 9.693347193347194, "grad_norm": 0.8265030384063721, "learning_rate": 3.0654266009489164e-05, "loss": 0.0918, "num_input_tokens_seen": 3555944, "step": 18650 }, { "epoch": 9.695945945945946, "grad_norm": 0.4097770154476166, "learning_rate": 3.064322005565291e-05, "loss": 0.0872, "num_input_tokens_seen": 3556872, "step": 18655 }, { "epoch": 9.698544698544698, "grad_norm": 1.096205234527588, "learning_rate": 3.063217294087182e-05, "loss": 0.1591, "num_input_tokens_seen": 3557832, "step": 18660 }, { "epoch": 9.701143451143452, "grad_norm": 0.4475817084312439, "learning_rate": 3.062112466741854e-05, "loss": 0.154, "num_input_tokens_seen": 3558760, "step": 18665 }, { "epoch": 9.703742203742204, "grad_norm": 0.44708216190338135, "learning_rate": 3.0610075237565974e-05, "loss": 0.2047, "num_input_tokens_seen": 3559752, "step": 18670 }, { "epoch": 9.706340956340956, "grad_norm": 0.6794671416282654, "learning_rate": 3.0599024653587226e-05, "loss": 0.0997, "num_input_tokens_seen": 3560776, "step": 18675 }, { "epoch": 9.70893970893971, "grad_norm": 0.9454622268676758, "learning_rate": 3.0587972917755695e-05, "loss": 0.0928, "num_input_tokens_seen": 3561672, "step": 18680 }, { "epoch": 9.711538461538462, "grad_norm": 0.9992848038673401, "learning_rate": 3.0576920032344966e-05, "loss": 0.1798, "num_input_tokens_seen": 3562600, "step": 18685 }, { "epoch": 9.714137214137214, "grad_norm": 1.2887386083602905, "learning_rate": 3.056586599962889e-05, "loss": 0.0918, "num_input_tokens_seen": 3563496, "step": 18690 }, { "epoch": 9.716735966735968, "grad_norm": 0.6512187123298645, "learning_rate": 3.055481082188154e-05, "loss": 0.224, "num_input_tokens_seen": 3564488, "step": 18695 }, { "epoch": 9.71933471933472, "grad_norm": 0.3431296646595001, "learning_rate": 3.0543754501377244e-05, "loss": 0.1461, "num_input_tokens_seen": 3565416, "step": 18700 }, { "epoch": 9.721933471933472, "grad_norm": 0.517914354801178, "learning_rate": 3.0532697040390534e-05, "loss": 0.1682, "num_input_tokens_seen": 3566344, "step": 18705 }, { "epoch": 9.724532224532224, "grad_norm": 0.7718478441238403, "learning_rate": 3.052163844119619e-05, "loss": 0.1595, "num_input_tokens_seen": 3567304, "step": 18710 }, { "epoch": 9.727130977130978, "grad_norm": 1.1091563701629639, "learning_rate": 3.051057870606925e-05, "loss": 0.2681, "num_input_tokens_seen": 3568264, "step": 18715 }, { "epoch": 9.72972972972973, "grad_norm": 0.9986427426338196, "learning_rate": 3.0499517837284948e-05, "loss": 0.1334, "num_input_tokens_seen": 3569192, "step": 18720 }, { "epoch": 9.732328482328482, "grad_norm": 1.976649284362793, "learning_rate": 3.048845583711878e-05, "loss": 0.1197, "num_input_tokens_seen": 3570184, "step": 18725 }, { "epoch": 9.734927234927234, "grad_norm": 0.08348076790571213, "learning_rate": 3.0477392707846452e-05, "loss": 0.096, "num_input_tokens_seen": 3571176, "step": 18730 }, { "epoch": 9.737525987525988, "grad_norm": 0.12867316603660583, "learning_rate": 3.0466328451743914e-05, "loss": 0.1074, "num_input_tokens_seen": 3572104, "step": 18735 }, { "epoch": 9.74012474012474, "grad_norm": 0.6753714680671692, "learning_rate": 3.0455263071087354e-05, "loss": 0.1874, "num_input_tokens_seen": 3573064, "step": 18740 }, { "epoch": 9.742723492723492, "grad_norm": 0.6511898040771484, "learning_rate": 3.044419656815318e-05, "loss": 0.1404, "num_input_tokens_seen": 3574056, "step": 18745 }, { "epoch": 9.745322245322246, "grad_norm": 0.6916228532791138, "learning_rate": 3.0433128945218044e-05, "loss": 0.1332, "num_input_tokens_seen": 3574952, "step": 18750 }, { "epoch": 9.747920997920998, "grad_norm": 0.7135441899299622, "learning_rate": 3.04220602045588e-05, "loss": 0.2055, "num_input_tokens_seen": 3575912, "step": 18755 }, { "epoch": 9.75051975051975, "grad_norm": 0.4934011995792389, "learning_rate": 3.0410990348452573e-05, "loss": 0.1486, "num_input_tokens_seen": 3576936, "step": 18760 }, { "epoch": 9.753118503118504, "grad_norm": 0.790336549282074, "learning_rate": 3.039991937917669e-05, "loss": 0.1815, "num_input_tokens_seen": 3577832, "step": 18765 }, { "epoch": 9.755717255717256, "grad_norm": 0.9814446568489075, "learning_rate": 3.0388847299008703e-05, "loss": 0.2099, "num_input_tokens_seen": 3578792, "step": 18770 }, { "epoch": 9.758316008316008, "grad_norm": 0.08595093339681625, "learning_rate": 3.037777411022642e-05, "loss": 0.0516, "num_input_tokens_seen": 3579784, "step": 18775 }, { "epoch": 9.76091476091476, "grad_norm": 1.029219150543213, "learning_rate": 3.036669981510784e-05, "loss": 0.1727, "num_input_tokens_seen": 3580808, "step": 18780 }, { "epoch": 9.763513513513514, "grad_norm": 0.22200503945350647, "learning_rate": 3.0355624415931226e-05, "loss": 0.1511, "num_input_tokens_seen": 3581704, "step": 18785 }, { "epoch": 9.766112266112266, "grad_norm": 0.32079169154167175, "learning_rate": 3.034454791497504e-05, "loss": 0.1547, "num_input_tokens_seen": 3582728, "step": 18790 }, { "epoch": 9.768711018711018, "grad_norm": 0.6581041812896729, "learning_rate": 3.0333470314518e-05, "loss": 0.1617, "num_input_tokens_seen": 3583624, "step": 18795 }, { "epoch": 9.771309771309772, "grad_norm": 0.37988677620887756, "learning_rate": 3.0322391616839006e-05, "loss": 0.1614, "num_input_tokens_seen": 3584648, "step": 18800 }, { "epoch": 9.773908523908524, "grad_norm": 0.39055827260017395, "learning_rate": 3.0311311824217236e-05, "loss": 0.1017, "num_input_tokens_seen": 3585576, "step": 18805 }, { "epoch": 9.776507276507276, "grad_norm": 0.6279420256614685, "learning_rate": 3.0300230938932045e-05, "loss": 0.197, "num_input_tokens_seen": 3586504, "step": 18810 }, { "epoch": 9.779106029106028, "grad_norm": 0.7916146516799927, "learning_rate": 3.0289148963263048e-05, "loss": 0.189, "num_input_tokens_seen": 3587496, "step": 18815 }, { "epoch": 9.781704781704782, "grad_norm": 0.2038624882698059, "learning_rate": 3.0278065899490072e-05, "loss": 0.1267, "num_input_tokens_seen": 3588456, "step": 18820 }, { "epoch": 9.784303534303534, "grad_norm": 0.4631391763687134, "learning_rate": 3.0266981749893157e-05, "loss": 0.161, "num_input_tokens_seen": 3589448, "step": 18825 }, { "epoch": 9.786902286902286, "grad_norm": 0.1040368601679802, "learning_rate": 3.0255896516752597e-05, "loss": 0.0583, "num_input_tokens_seen": 3590344, "step": 18830 }, { "epoch": 9.78950103950104, "grad_norm": 0.6345663070678711, "learning_rate": 3.0244810202348866e-05, "loss": 0.066, "num_input_tokens_seen": 3591240, "step": 18835 }, { "epoch": 9.792099792099792, "grad_norm": 0.4694373607635498, "learning_rate": 3.0233722808962694e-05, "loss": 0.1712, "num_input_tokens_seen": 3592232, "step": 18840 }, { "epoch": 9.794698544698544, "grad_norm": 1.2532904148101807, "learning_rate": 3.0222634338875023e-05, "loss": 0.1181, "num_input_tokens_seen": 3593192, "step": 18845 }, { "epoch": 9.797297297297296, "grad_norm": 0.5597095489501953, "learning_rate": 3.0211544794367008e-05, "loss": 0.1008, "num_input_tokens_seen": 3594120, "step": 18850 }, { "epoch": 9.79989604989605, "grad_norm": 0.19523625075817108, "learning_rate": 3.020045417772004e-05, "loss": 0.1019, "num_input_tokens_seen": 3595080, "step": 18855 }, { "epoch": 9.802494802494802, "grad_norm": 0.6436668038368225, "learning_rate": 3.018936249121573e-05, "loss": 0.0752, "num_input_tokens_seen": 3595944, "step": 18860 }, { "epoch": 9.805093555093555, "grad_norm": 0.11638372391462326, "learning_rate": 3.0178269737135885e-05, "loss": 0.0848, "num_input_tokens_seen": 3596840, "step": 18865 }, { "epoch": 9.807692307692308, "grad_norm": 0.04746755585074425, "learning_rate": 3.0167175917762553e-05, "loss": 0.0738, "num_input_tokens_seen": 3597864, "step": 18870 }, { "epoch": 9.81029106029106, "grad_norm": 0.868020236492157, "learning_rate": 3.015608103537801e-05, "loss": 0.154, "num_input_tokens_seen": 3598856, "step": 18875 }, { "epoch": 9.812889812889813, "grad_norm": 0.6765603423118591, "learning_rate": 3.0144985092264723e-05, "loss": 0.1073, "num_input_tokens_seen": 3599720, "step": 18880 }, { "epoch": 9.815488565488565, "grad_norm": 0.35290494561195374, "learning_rate": 3.0133888090705397e-05, "loss": 0.2512, "num_input_tokens_seen": 3600712, "step": 18885 }, { "epoch": 9.818087318087318, "grad_norm": 0.759772539138794, "learning_rate": 3.012279003298295e-05, "loss": 0.0809, "num_input_tokens_seen": 3601640, "step": 18890 }, { "epoch": 9.82068607068607, "grad_norm": 0.14368264377117157, "learning_rate": 3.0111690921380527e-05, "loss": 0.1388, "num_input_tokens_seen": 3602632, "step": 18895 }, { "epoch": 9.823284823284823, "grad_norm": 1.1518135070800781, "learning_rate": 3.010059075818146e-05, "loss": 0.1421, "num_input_tokens_seen": 3603624, "step": 18900 }, { "epoch": 9.825883575883577, "grad_norm": 0.1691375970840454, "learning_rate": 3.008948954566933e-05, "loss": 0.0337, "num_input_tokens_seen": 3604584, "step": 18905 }, { "epoch": 9.828482328482329, "grad_norm": 0.7502545118331909, "learning_rate": 3.0078387286127924e-05, "loss": 0.0853, "num_input_tokens_seen": 3605576, "step": 18910 }, { "epoch": 9.83108108108108, "grad_norm": 0.07884793728590012, "learning_rate": 3.0067283981841222e-05, "loss": 0.0793, "num_input_tokens_seen": 3606472, "step": 18915 }, { "epoch": 9.833679833679835, "grad_norm": 0.41198766231536865, "learning_rate": 3.0056179635093464e-05, "loss": 0.0946, "num_input_tokens_seen": 3607496, "step": 18920 }, { "epoch": 9.836278586278587, "grad_norm": 0.2810077369213104, "learning_rate": 3.004507424816906e-05, "loss": 0.0989, "num_input_tokens_seen": 3608392, "step": 18925 }, { "epoch": 9.838877338877339, "grad_norm": 0.3652256727218628, "learning_rate": 3.0033967823352664e-05, "loss": 0.1322, "num_input_tokens_seen": 3609384, "step": 18930 }, { "epoch": 9.84147609147609, "grad_norm": 0.8311475515365601, "learning_rate": 3.0022860362929122e-05, "loss": 0.0764, "num_input_tokens_seen": 3610376, "step": 18935 }, { "epoch": 9.844074844074845, "grad_norm": 0.09402421861886978, "learning_rate": 3.001175186918351e-05, "loss": 0.2085, "num_input_tokens_seen": 3611368, "step": 18940 }, { "epoch": 9.846673596673597, "grad_norm": 0.6446236968040466, "learning_rate": 3.0000642344401113e-05, "loss": 0.0915, "num_input_tokens_seen": 3612328, "step": 18945 }, { "epoch": 9.849272349272349, "grad_norm": 0.33992817997932434, "learning_rate": 2.9989531790867415e-05, "loss": 0.11, "num_input_tokens_seen": 3613288, "step": 18950 }, { "epoch": 9.851871101871101, "grad_norm": 0.2622523009777069, "learning_rate": 2.997842021086813e-05, "loss": 0.099, "num_input_tokens_seen": 3614280, "step": 18955 }, { "epoch": 9.854469854469855, "grad_norm": 0.6754992604255676, "learning_rate": 2.9967307606689177e-05, "loss": 0.1494, "num_input_tokens_seen": 3615240, "step": 18960 }, { "epoch": 9.857068607068607, "grad_norm": 0.36814549565315247, "learning_rate": 2.9956193980616677e-05, "loss": 0.106, "num_input_tokens_seen": 3616168, "step": 18965 }, { "epoch": 9.859667359667359, "grad_norm": 2.144777297973633, "learning_rate": 2.994507933493696e-05, "loss": 0.1749, "num_input_tokens_seen": 3617032, "step": 18970 }, { "epoch": 9.862266112266113, "grad_norm": 0.6788378953933716, "learning_rate": 2.9933963671936587e-05, "loss": 0.1562, "num_input_tokens_seen": 3618024, "step": 18975 }, { "epoch": 9.864864864864865, "grad_norm": 1.043016791343689, "learning_rate": 2.992284699390231e-05, "loss": 0.0904, "num_input_tokens_seen": 3618920, "step": 18980 }, { "epoch": 9.867463617463617, "grad_norm": 0.35816752910614014, "learning_rate": 2.99117293031211e-05, "loss": 0.2209, "num_input_tokens_seen": 3619848, "step": 18985 }, { "epoch": 9.87006237006237, "grad_norm": 0.14041057229042053, "learning_rate": 2.990061060188012e-05, "loss": 0.1069, "num_input_tokens_seen": 3620744, "step": 18990 }, { "epoch": 9.872661122661123, "grad_norm": 0.7998389601707458, "learning_rate": 2.9889490892466766e-05, "loss": 0.1375, "num_input_tokens_seen": 3621608, "step": 18995 }, { "epoch": 9.875259875259875, "grad_norm": 1.604931116104126, "learning_rate": 2.9878370177168613e-05, "loss": 0.2941, "num_input_tokens_seen": 3622504, "step": 19000 }, { "epoch": 9.877858627858627, "grad_norm": 1.1231194734573364, "learning_rate": 2.986724845827346e-05, "loss": 0.1783, "num_input_tokens_seen": 3623496, "step": 19005 }, { "epoch": 9.880457380457381, "grad_norm": 0.8781957030296326, "learning_rate": 2.985612573806932e-05, "loss": 0.1738, "num_input_tokens_seen": 3624392, "step": 19010 }, { "epoch": 9.883056133056133, "grad_norm": 1.1564644575119019, "learning_rate": 2.9845002018844388e-05, "loss": 0.1824, "num_input_tokens_seen": 3625384, "step": 19015 }, { "epoch": 9.885654885654885, "grad_norm": 1.1528147459030151, "learning_rate": 2.9833877302887086e-05, "loss": 0.1661, "num_input_tokens_seen": 3626376, "step": 19020 }, { "epoch": 9.888253638253639, "grad_norm": 0.5626353025436401, "learning_rate": 2.9822751592486036e-05, "loss": 0.2249, "num_input_tokens_seen": 3627432, "step": 19025 }, { "epoch": 9.890852390852391, "grad_norm": 0.7983288168907166, "learning_rate": 2.981162488993005e-05, "loss": 0.0531, "num_input_tokens_seen": 3628456, "step": 19030 }, { "epoch": 9.893451143451143, "grad_norm": 0.5183022022247314, "learning_rate": 2.980049719750817e-05, "loss": 0.0787, "num_input_tokens_seen": 3629416, "step": 19035 }, { "epoch": 9.896049896049895, "grad_norm": 0.6900389194488525, "learning_rate": 2.9789368517509615e-05, "loss": 0.1931, "num_input_tokens_seen": 3630344, "step": 19040 }, { "epoch": 9.89864864864865, "grad_norm": 0.716482937335968, "learning_rate": 2.9778238852223832e-05, "loss": 0.162, "num_input_tokens_seen": 3631304, "step": 19045 }, { "epoch": 9.901247401247401, "grad_norm": 0.848336398601532, "learning_rate": 2.9767108203940448e-05, "loss": 0.1169, "num_input_tokens_seen": 3632232, "step": 19050 }, { "epoch": 9.903846153846153, "grad_norm": 0.6776134967803955, "learning_rate": 2.9755976574949307e-05, "loss": 0.0839, "num_input_tokens_seen": 3633096, "step": 19055 }, { "epoch": 9.906444906444907, "grad_norm": 0.37039992213249207, "learning_rate": 2.9744843967540455e-05, "loss": 0.0378, "num_input_tokens_seen": 3634024, "step": 19060 }, { "epoch": 9.90904365904366, "grad_norm": 1.073974847793579, "learning_rate": 2.973371038400412e-05, "loss": 0.0979, "num_input_tokens_seen": 3634952, "step": 19065 }, { "epoch": 9.911642411642411, "grad_norm": 0.34920257329940796, "learning_rate": 2.972257582663076e-05, "loss": 0.1005, "num_input_tokens_seen": 3635944, "step": 19070 }, { "epoch": 9.914241164241163, "grad_norm": 0.5770790576934814, "learning_rate": 2.9711440297711008e-05, "loss": 0.0885, "num_input_tokens_seen": 3636872, "step": 19075 }, { "epoch": 9.916839916839917, "grad_norm": 0.5857679843902588, "learning_rate": 2.9700303799535723e-05, "loss": 0.1155, "num_input_tokens_seen": 3637800, "step": 19080 }, { "epoch": 9.91943866943867, "grad_norm": 0.8106997609138489, "learning_rate": 2.968916633439593e-05, "loss": 0.1917, "num_input_tokens_seen": 3638760, "step": 19085 }, { "epoch": 9.922037422037421, "grad_norm": 0.18883095681667328, "learning_rate": 2.967802790458289e-05, "loss": 0.0589, "num_input_tokens_seen": 3639784, "step": 19090 }, { "epoch": 9.924636174636175, "grad_norm": 0.913567066192627, "learning_rate": 2.966688851238802e-05, "loss": 0.0857, "num_input_tokens_seen": 3640648, "step": 19095 }, { "epoch": 9.927234927234927, "grad_norm": 0.11714102327823639, "learning_rate": 2.9655748160102982e-05, "loss": 0.0735, "num_input_tokens_seen": 3641608, "step": 19100 }, { "epoch": 9.92983367983368, "grad_norm": 0.7127164602279663, "learning_rate": 2.964460685001959e-05, "loss": 0.095, "num_input_tokens_seen": 3642568, "step": 19105 }, { "epoch": 9.932432432432432, "grad_norm": 0.5973227024078369, "learning_rate": 2.9633464584429892e-05, "loss": 0.1222, "num_input_tokens_seen": 3643528, "step": 19110 }, { "epoch": 9.935031185031185, "grad_norm": 0.9124866127967834, "learning_rate": 2.9622321365626115e-05, "loss": 0.2148, "num_input_tokens_seen": 3644520, "step": 19115 }, { "epoch": 9.937629937629938, "grad_norm": 0.513958215713501, "learning_rate": 2.9611177195900685e-05, "loss": 0.0735, "num_input_tokens_seen": 3645416, "step": 19120 }, { "epoch": 9.94022869022869, "grad_norm": 0.7369128465652466, "learning_rate": 2.9600032077546224e-05, "loss": 0.1015, "num_input_tokens_seen": 3646344, "step": 19125 }, { "epoch": 9.942827442827443, "grad_norm": 0.22983480989933014, "learning_rate": 2.9588886012855537e-05, "loss": 0.0828, "num_input_tokens_seen": 3647304, "step": 19130 }, { "epoch": 9.945426195426196, "grad_norm": 0.12018577009439468, "learning_rate": 2.9577739004121652e-05, "loss": 0.0664, "num_input_tokens_seen": 3648232, "step": 19135 }, { "epoch": 9.948024948024948, "grad_norm": 1.0292887687683105, "learning_rate": 2.956659105363776e-05, "loss": 0.1526, "num_input_tokens_seen": 3649160, "step": 19140 }, { "epoch": 9.950623700623701, "grad_norm": 0.46489450335502625, "learning_rate": 2.9555442163697272e-05, "loss": 0.1509, "num_input_tokens_seen": 3650184, "step": 19145 }, { "epoch": 9.953222453222454, "grad_norm": 0.9327641129493713, "learning_rate": 2.9544292336593772e-05, "loss": 0.1729, "num_input_tokens_seen": 3651144, "step": 19150 }, { "epoch": 9.955821205821206, "grad_norm": 0.6046791076660156, "learning_rate": 2.953314157462105e-05, "loss": 0.0906, "num_input_tokens_seen": 3652136, "step": 19155 }, { "epoch": 9.958419958419958, "grad_norm": 0.7027129530906677, "learning_rate": 2.9521989880073086e-05, "loss": 0.1285, "num_input_tokens_seen": 3653096, "step": 19160 }, { "epoch": 9.961018711018712, "grad_norm": 1.1228593587875366, "learning_rate": 2.9510837255244034e-05, "loss": 0.286, "num_input_tokens_seen": 3654088, "step": 19165 }, { "epoch": 9.963617463617464, "grad_norm": 1.301227331161499, "learning_rate": 2.949968370242827e-05, "loss": 0.1611, "num_input_tokens_seen": 3654984, "step": 19170 }, { "epoch": 9.966216216216216, "grad_norm": 0.43096569180488586, "learning_rate": 2.9488529223920342e-05, "loss": 0.0371, "num_input_tokens_seen": 3655976, "step": 19175 }, { "epoch": 9.96881496881497, "grad_norm": 2.406454563140869, "learning_rate": 2.9477373822014992e-05, "loss": 0.257, "num_input_tokens_seen": 3656936, "step": 19180 }, { "epoch": 9.971413721413722, "grad_norm": 0.31383630633354187, "learning_rate": 2.946621749900715e-05, "loss": 0.2282, "num_input_tokens_seen": 3657896, "step": 19185 }, { "epoch": 9.974012474012474, "grad_norm": 0.3659895062446594, "learning_rate": 2.9455060257191945e-05, "loss": 0.068, "num_input_tokens_seen": 3658824, "step": 19190 }, { "epoch": 9.976611226611226, "grad_norm": 0.18498453497886658, "learning_rate": 2.9443902098864672e-05, "loss": 0.1558, "num_input_tokens_seen": 3659848, "step": 19195 }, { "epoch": 9.97920997920998, "grad_norm": 0.6734769940376282, "learning_rate": 2.9432743026320842e-05, "loss": 0.0973, "num_input_tokens_seen": 3660808, "step": 19200 }, { "epoch": 9.981808731808732, "grad_norm": 0.9552810788154602, "learning_rate": 2.9421583041856136e-05, "loss": 0.1489, "num_input_tokens_seen": 3661768, "step": 19205 }, { "epoch": 9.984407484407484, "grad_norm": 0.2752419412136078, "learning_rate": 2.9410422147766438e-05, "loss": 0.1171, "num_input_tokens_seen": 3662696, "step": 19210 }, { "epoch": 9.987006237006238, "grad_norm": 0.7702627182006836, "learning_rate": 2.93992603463478e-05, "loss": 0.0816, "num_input_tokens_seen": 3663624, "step": 19215 }, { "epoch": 9.98960498960499, "grad_norm": 1.0621148347854614, "learning_rate": 2.9388097639896484e-05, "loss": 0.1986, "num_input_tokens_seen": 3664552, "step": 19220 }, { "epoch": 9.992203742203742, "grad_norm": 0.5921289920806885, "learning_rate": 2.9376934030708908e-05, "loss": 0.1496, "num_input_tokens_seen": 3665512, "step": 19225 }, { "epoch": 9.994802494802494, "grad_norm": 0.3195056915283203, "learning_rate": 2.9365769521081694e-05, "loss": 0.1329, "num_input_tokens_seen": 3666376, "step": 19230 }, { "epoch": 9.997401247401248, "grad_norm": 0.15362559258937836, "learning_rate": 2.9354604113311663e-05, "loss": 0.1068, "num_input_tokens_seen": 3667368, "step": 19235 }, { "epoch": 10.0, "grad_norm": 0.9028632044792175, "learning_rate": 2.9343437809695785e-05, "loss": 0.2095, "num_input_tokens_seen": 3668312, "step": 19240 }, { "epoch": 10.0, "eval_loss": 0.14727549254894257, "eval_runtime": 8.0304, "eval_samples_per_second": 106.595, "eval_steps_per_second": 26.649, "num_input_tokens_seen": 3668312, "step": 19240 }, { "epoch": 10.002598752598752, "grad_norm": 0.8984609842300415, "learning_rate": 2.9332270612531254e-05, "loss": 0.1466, "num_input_tokens_seen": 3669272, "step": 19245 }, { "epoch": 10.005197505197506, "grad_norm": 0.4613308012485504, "learning_rate": 2.9321102524115417e-05, "loss": 0.1022, "num_input_tokens_seen": 3670168, "step": 19250 }, { "epoch": 10.007796257796258, "grad_norm": 0.8428023457527161, "learning_rate": 2.930993354674582e-05, "loss": 0.1012, "num_input_tokens_seen": 3671128, "step": 19255 }, { "epoch": 10.01039501039501, "grad_norm": 1.1994976997375488, "learning_rate": 2.9298763682720194e-05, "loss": 0.2664, "num_input_tokens_seen": 3672056, "step": 19260 }, { "epoch": 10.012993762993762, "grad_norm": 0.7145564556121826, "learning_rate": 2.928759293433643e-05, "loss": 0.1182, "num_input_tokens_seen": 3672952, "step": 19265 }, { "epoch": 10.015592515592516, "grad_norm": 0.7592386603355408, "learning_rate": 2.9276421303892627e-05, "loss": 0.1184, "num_input_tokens_seen": 3673912, "step": 19270 }, { "epoch": 10.018191268191268, "grad_norm": 0.8141966462135315, "learning_rate": 2.926524879368705e-05, "loss": 0.1786, "num_input_tokens_seen": 3674904, "step": 19275 }, { "epoch": 10.02079002079002, "grad_norm": 1.4936710596084595, "learning_rate": 2.9254075406018165e-05, "loss": 0.1131, "num_input_tokens_seen": 3675800, "step": 19280 }, { "epoch": 10.023388773388774, "grad_norm": 0.5856519937515259, "learning_rate": 2.924290114318459e-05, "loss": 0.2336, "num_input_tokens_seen": 3676792, "step": 19285 }, { "epoch": 10.025987525987526, "grad_norm": 1.5703812837600708, "learning_rate": 2.9231726007485143e-05, "loss": 0.1558, "num_input_tokens_seen": 3677752, "step": 19290 }, { "epoch": 10.028586278586278, "grad_norm": 0.787590742111206, "learning_rate": 2.9220550001218815e-05, "loss": 0.1209, "num_input_tokens_seen": 3678680, "step": 19295 }, { "epoch": 10.03118503118503, "grad_norm": 0.1524621993303299, "learning_rate": 2.9209373126684773e-05, "loss": 0.2136, "num_input_tokens_seen": 3679608, "step": 19300 }, { "epoch": 10.033783783783784, "grad_norm": 0.5799196362495422, "learning_rate": 2.9198195386182376e-05, "loss": 0.135, "num_input_tokens_seen": 3680600, "step": 19305 }, { "epoch": 10.036382536382536, "grad_norm": 1.169182538986206, "learning_rate": 2.9187016782011135e-05, "loss": 0.1258, "num_input_tokens_seen": 3681496, "step": 19310 }, { "epoch": 10.038981288981288, "grad_norm": 0.485152930021286, "learning_rate": 2.917583731647077e-05, "loss": 0.1845, "num_input_tokens_seen": 3682456, "step": 19315 }, { "epoch": 10.041580041580042, "grad_norm": 0.47893428802490234, "learning_rate": 2.9164656991861167e-05, "loss": 0.2092, "num_input_tokens_seen": 3683320, "step": 19320 }, { "epoch": 10.044178794178794, "grad_norm": 0.21849490702152252, "learning_rate": 2.9153475810482367e-05, "loss": 0.075, "num_input_tokens_seen": 3684280, "step": 19325 }, { "epoch": 10.046777546777546, "grad_norm": 0.17575190961360931, "learning_rate": 2.914229377463462e-05, "loss": 0.1686, "num_input_tokens_seen": 3685304, "step": 19330 }, { "epoch": 10.049376299376299, "grad_norm": 0.3559441864490509, "learning_rate": 2.9131110886618328e-05, "loss": 0.08, "num_input_tokens_seen": 3686264, "step": 19335 }, { "epoch": 10.051975051975052, "grad_norm": 1.045626163482666, "learning_rate": 2.911992714873409e-05, "loss": 0.1088, "num_input_tokens_seen": 3687224, "step": 19340 }, { "epoch": 10.054573804573804, "grad_norm": 2.2457118034362793, "learning_rate": 2.9108742563282652e-05, "loss": 0.1531, "num_input_tokens_seen": 3688216, "step": 19345 }, { "epoch": 10.057172557172557, "grad_norm": 0.4300983250141144, "learning_rate": 2.9097557132564962e-05, "loss": 0.1626, "num_input_tokens_seen": 3689208, "step": 19350 }, { "epoch": 10.05977130977131, "grad_norm": 0.2466953694820404, "learning_rate": 2.908637085888213e-05, "loss": 0.1267, "num_input_tokens_seen": 3690168, "step": 19355 }, { "epoch": 10.062370062370062, "grad_norm": 0.24728107452392578, "learning_rate": 2.9075183744535428e-05, "loss": 0.1562, "num_input_tokens_seen": 3691128, "step": 19360 }, { "epoch": 10.064968814968815, "grad_norm": 0.576466977596283, "learning_rate": 2.9063995791826316e-05, "loss": 0.231, "num_input_tokens_seen": 3692184, "step": 19365 }, { "epoch": 10.067567567567568, "grad_norm": 0.5510562658309937, "learning_rate": 2.905280700305643e-05, "loss": 0.1361, "num_input_tokens_seen": 3693112, "step": 19370 }, { "epoch": 10.07016632016632, "grad_norm": 1.0158284902572632, "learning_rate": 2.9041617380527562e-05, "loss": 0.1108, "num_input_tokens_seen": 3694072, "step": 19375 }, { "epoch": 10.072765072765073, "grad_norm": 0.7957232594490051, "learning_rate": 2.9030426926541687e-05, "loss": 0.2101, "num_input_tokens_seen": 3695000, "step": 19380 }, { "epoch": 10.075363825363825, "grad_norm": 0.5429555773735046, "learning_rate": 2.9019235643400955e-05, "loss": 0.1365, "num_input_tokens_seen": 3695896, "step": 19385 }, { "epoch": 10.077962577962579, "grad_norm": 0.8897971510887146, "learning_rate": 2.9008043533407664e-05, "loss": 0.1385, "num_input_tokens_seen": 3696856, "step": 19390 }, { "epoch": 10.08056133056133, "grad_norm": 0.8767789006233215, "learning_rate": 2.899685059886431e-05, "loss": 0.1262, "num_input_tokens_seen": 3697848, "step": 19395 }, { "epoch": 10.083160083160083, "grad_norm": 0.5606285333633423, "learning_rate": 2.8985656842073545e-05, "loss": 0.1164, "num_input_tokens_seen": 3698840, "step": 19400 }, { "epoch": 10.085758835758837, "grad_norm": 0.12156257033348083, "learning_rate": 2.8974462265338188e-05, "loss": 0.037, "num_input_tokens_seen": 3699800, "step": 19405 }, { "epoch": 10.088357588357589, "grad_norm": 0.5988990664482117, "learning_rate": 2.8963266870961226e-05, "loss": 0.2826, "num_input_tokens_seen": 3700856, "step": 19410 }, { "epoch": 10.09095634095634, "grad_norm": 0.9974862337112427, "learning_rate": 2.8952070661245833e-05, "loss": 0.0843, "num_input_tokens_seen": 3701784, "step": 19415 }, { "epoch": 10.093555093555093, "grad_norm": 0.20621450245380402, "learning_rate": 2.8940873638495325e-05, "loss": 0.1814, "num_input_tokens_seen": 3702776, "step": 19420 }, { "epoch": 10.096153846153847, "grad_norm": 0.553715705871582, "learning_rate": 2.8929675805013195e-05, "loss": 0.1784, "num_input_tokens_seen": 3703704, "step": 19425 }, { "epoch": 10.098752598752599, "grad_norm": 0.40555107593536377, "learning_rate": 2.8918477163103107e-05, "loss": 0.2155, "num_input_tokens_seen": 3704728, "step": 19430 }, { "epoch": 10.10135135135135, "grad_norm": 0.5502016544342041, "learning_rate": 2.890727771506889e-05, "loss": 0.0979, "num_input_tokens_seen": 3705656, "step": 19435 }, { "epoch": 10.103950103950105, "grad_norm": 0.8024094700813293, "learning_rate": 2.8896077463214543e-05, "loss": 0.1165, "num_input_tokens_seen": 3706648, "step": 19440 }, { "epoch": 10.106548856548857, "grad_norm": 0.3382473886013031, "learning_rate": 2.8884876409844213e-05, "loss": 0.0891, "num_input_tokens_seen": 3707608, "step": 19445 }, { "epoch": 10.109147609147609, "grad_norm": 0.20847350358963013, "learning_rate": 2.887367455726223e-05, "loss": 0.1034, "num_input_tokens_seen": 3708664, "step": 19450 }, { "epoch": 10.111746361746361, "grad_norm": 1.231932282447815, "learning_rate": 2.8862471907773088e-05, "loss": 0.1654, "num_input_tokens_seen": 3709592, "step": 19455 }, { "epoch": 10.114345114345115, "grad_norm": 0.6319548487663269, "learning_rate": 2.885126846368142e-05, "loss": 0.1661, "num_input_tokens_seen": 3710520, "step": 19460 }, { "epoch": 10.116943866943867, "grad_norm": 0.5776629447937012, "learning_rate": 2.8840064227292062e-05, "loss": 0.1833, "num_input_tokens_seen": 3711448, "step": 19465 }, { "epoch": 10.119542619542619, "grad_norm": 0.3430441617965698, "learning_rate": 2.8828859200909973e-05, "loss": 0.2237, "num_input_tokens_seen": 3712376, "step": 19470 }, { "epoch": 10.122141372141373, "grad_norm": 0.23850327730178833, "learning_rate": 2.881765338684032e-05, "loss": 0.2076, "num_input_tokens_seen": 3713400, "step": 19475 }, { "epoch": 10.124740124740125, "grad_norm": 0.6076775193214417, "learning_rate": 2.8806446787388382e-05, "loss": 0.1583, "num_input_tokens_seen": 3714424, "step": 19480 }, { "epoch": 10.127338877338877, "grad_norm": 0.40789660811424255, "learning_rate": 2.8795239404859636e-05, "loss": 0.2045, "num_input_tokens_seen": 3715448, "step": 19485 }, { "epoch": 10.12993762993763, "grad_norm": 0.5702039003372192, "learning_rate": 2.87840312415597e-05, "loss": 0.1135, "num_input_tokens_seen": 3716408, "step": 19490 }, { "epoch": 10.132536382536383, "grad_norm": 1.0225639343261719, "learning_rate": 2.8772822299794366e-05, "loss": 0.1096, "num_input_tokens_seen": 3717464, "step": 19495 }, { "epoch": 10.135135135135135, "grad_norm": 0.13108274340629578, "learning_rate": 2.876161258186958e-05, "loss": 0.1943, "num_input_tokens_seen": 3718424, "step": 19500 }, { "epoch": 10.137733887733887, "grad_norm": 0.6764465570449829, "learning_rate": 2.8750402090091438e-05, "loss": 0.0989, "num_input_tokens_seen": 3719352, "step": 19505 }, { "epoch": 10.140332640332641, "grad_norm": 0.11017978191375732, "learning_rate": 2.8739190826766227e-05, "loss": 0.1052, "num_input_tokens_seen": 3720216, "step": 19510 }, { "epoch": 10.142931392931393, "grad_norm": 0.4299580752849579, "learning_rate": 2.8727978794200355e-05, "loss": 0.1231, "num_input_tokens_seen": 3721240, "step": 19515 }, { "epoch": 10.145530145530145, "grad_norm": 0.5678550601005554, "learning_rate": 2.8716765994700406e-05, "loss": 0.1015, "num_input_tokens_seen": 3722200, "step": 19520 }, { "epoch": 10.148128898128897, "grad_norm": 0.28524914383888245, "learning_rate": 2.870555243057312e-05, "loss": 0.103, "num_input_tokens_seen": 3723096, "step": 19525 }, { "epoch": 10.150727650727651, "grad_norm": 0.2844633162021637, "learning_rate": 2.8694338104125396e-05, "loss": 0.0827, "num_input_tokens_seen": 3724024, "step": 19530 }, { "epoch": 10.153326403326403, "grad_norm": 0.05002322420477867, "learning_rate": 2.8683123017664282e-05, "loss": 0.144, "num_input_tokens_seen": 3724920, "step": 19535 }, { "epoch": 10.155925155925155, "grad_norm": 0.18911099433898926, "learning_rate": 2.8671907173497005e-05, "loss": 0.0843, "num_input_tokens_seen": 3725848, "step": 19540 }, { "epoch": 10.15852390852391, "grad_norm": 0.17686934769153595, "learning_rate": 2.8660690573930915e-05, "loss": 0.0655, "num_input_tokens_seen": 3726840, "step": 19545 }, { "epoch": 10.161122661122661, "grad_norm": 0.6859423518180847, "learning_rate": 2.8649473221273544e-05, "loss": 0.1932, "num_input_tokens_seen": 3727832, "step": 19550 }, { "epoch": 10.163721413721413, "grad_norm": 1.0338534116744995, "learning_rate": 2.8638255117832563e-05, "loss": 0.2711, "num_input_tokens_seen": 3728760, "step": 19555 }, { "epoch": 10.166320166320165, "grad_norm": 1.0692311525344849, "learning_rate": 2.8627036265915803e-05, "loss": 0.057, "num_input_tokens_seen": 3729720, "step": 19560 }, { "epoch": 10.16891891891892, "grad_norm": 0.7690746784210205, "learning_rate": 2.8615816667831254e-05, "loss": 0.0505, "num_input_tokens_seen": 3730648, "step": 19565 }, { "epoch": 10.171517671517671, "grad_norm": 0.14423871040344238, "learning_rate": 2.8604596325887046e-05, "loss": 0.16, "num_input_tokens_seen": 3731512, "step": 19570 }, { "epoch": 10.174116424116423, "grad_norm": 0.4047400951385498, "learning_rate": 2.859337524239148e-05, "loss": 0.0567, "num_input_tokens_seen": 3732472, "step": 19575 }, { "epoch": 10.176715176715177, "grad_norm": 0.8640027642250061, "learning_rate": 2.8582153419652997e-05, "loss": 0.2828, "num_input_tokens_seen": 3733464, "step": 19580 }, { "epoch": 10.17931392931393, "grad_norm": 0.2846801280975342, "learning_rate": 2.8570930859980188e-05, "loss": 0.0742, "num_input_tokens_seen": 3734392, "step": 19585 }, { "epoch": 10.181912681912682, "grad_norm": 0.4705115258693695, "learning_rate": 2.8559707565681816e-05, "loss": 0.1179, "num_input_tokens_seen": 3735352, "step": 19590 }, { "epoch": 10.184511434511435, "grad_norm": 1.1928107738494873, "learning_rate": 2.8548483539066763e-05, "loss": 0.1729, "num_input_tokens_seen": 3736312, "step": 19595 }, { "epoch": 10.187110187110187, "grad_norm": 0.8496614098548889, "learning_rate": 2.8537258782444087e-05, "loss": 0.1522, "num_input_tokens_seen": 3737208, "step": 19600 }, { "epoch": 10.18970893970894, "grad_norm": 0.36451953649520874, "learning_rate": 2.8526033298122985e-05, "loss": 0.0526, "num_input_tokens_seen": 3738136, "step": 19605 }, { "epoch": 10.192307692307692, "grad_norm": 0.46996140480041504, "learning_rate": 2.851480708841281e-05, "loss": 0.1444, "num_input_tokens_seen": 3739096, "step": 19610 }, { "epoch": 10.194906444906445, "grad_norm": 0.21652135252952576, "learning_rate": 2.850358015562306e-05, "loss": 0.041, "num_input_tokens_seen": 3739992, "step": 19615 }, { "epoch": 10.197505197505198, "grad_norm": 0.19729016721248627, "learning_rate": 2.8492352502063384e-05, "loss": 0.1168, "num_input_tokens_seen": 3740984, "step": 19620 }, { "epoch": 10.20010395010395, "grad_norm": 0.22947165369987488, "learning_rate": 2.848112413004357e-05, "loss": 0.1339, "num_input_tokens_seen": 3741912, "step": 19625 }, { "epoch": 10.202702702702704, "grad_norm": 0.67360919713974, "learning_rate": 2.8469895041873568e-05, "loss": 0.1433, "num_input_tokens_seen": 3742968, "step": 19630 }, { "epoch": 10.205301455301456, "grad_norm": 0.9870694279670715, "learning_rate": 2.8458665239863468e-05, "loss": 0.1327, "num_input_tokens_seen": 3743928, "step": 19635 }, { "epoch": 10.207900207900208, "grad_norm": 0.18928532302379608, "learning_rate": 2.8447434726323513e-05, "loss": 0.1073, "num_input_tokens_seen": 3744888, "step": 19640 }, { "epoch": 10.21049896049896, "grad_norm": 0.7659563422203064, "learning_rate": 2.8436203503564084e-05, "loss": 0.156, "num_input_tokens_seen": 3745784, "step": 19645 }, { "epoch": 10.213097713097714, "grad_norm": 0.20402920246124268, "learning_rate": 2.8424971573895705e-05, "loss": 0.122, "num_input_tokens_seen": 3746744, "step": 19650 }, { "epoch": 10.215696465696466, "grad_norm": 0.38360893726348877, "learning_rate": 2.841373893962906e-05, "loss": 0.1947, "num_input_tokens_seen": 3747640, "step": 19655 }, { "epoch": 10.218295218295218, "grad_norm": 0.7143480181694031, "learning_rate": 2.8402505603074958e-05, "loss": 0.1711, "num_input_tokens_seen": 3748600, "step": 19660 }, { "epoch": 10.220893970893972, "grad_norm": 0.9041780233383179, "learning_rate": 2.839127156654438e-05, "loss": 0.1345, "num_input_tokens_seen": 3749496, "step": 19665 }, { "epoch": 10.223492723492724, "grad_norm": 0.2112123966217041, "learning_rate": 2.8380036832348423e-05, "loss": 0.0948, "num_input_tokens_seen": 3750392, "step": 19670 }, { "epoch": 10.226091476091476, "grad_norm": 1.1508194208145142, "learning_rate": 2.8368801402798344e-05, "loss": 0.1962, "num_input_tokens_seen": 3751352, "step": 19675 }, { "epoch": 10.228690228690228, "grad_norm": 0.7675444483757019, "learning_rate": 2.8357565280205538e-05, "loss": 0.1025, "num_input_tokens_seen": 3752312, "step": 19680 }, { "epoch": 10.231288981288982, "grad_norm": 1.0842320919036865, "learning_rate": 2.8346328466881545e-05, "loss": 0.1019, "num_input_tokens_seen": 3753304, "step": 19685 }, { "epoch": 10.233887733887734, "grad_norm": 0.5642638206481934, "learning_rate": 2.833509096513804e-05, "loss": 0.1135, "num_input_tokens_seen": 3754296, "step": 19690 }, { "epoch": 10.236486486486486, "grad_norm": 0.8197750449180603, "learning_rate": 2.8323852777286845e-05, "loss": 0.2307, "num_input_tokens_seen": 3755192, "step": 19695 }, { "epoch": 10.23908523908524, "grad_norm": 0.7691067457199097, "learning_rate": 2.831261390563994e-05, "loss": 0.2037, "num_input_tokens_seen": 3756184, "step": 19700 }, { "epoch": 10.241683991683992, "grad_norm": 0.5055683851242065, "learning_rate": 2.8301374352509402e-05, "loss": 0.1473, "num_input_tokens_seen": 3757272, "step": 19705 }, { "epoch": 10.244282744282744, "grad_norm": 0.5828306078910828, "learning_rate": 2.8290134120207496e-05, "loss": 0.0825, "num_input_tokens_seen": 3758136, "step": 19710 }, { "epoch": 10.246881496881496, "grad_norm": 0.39882904291152954, "learning_rate": 2.8278893211046592e-05, "loss": 0.1397, "num_input_tokens_seen": 3759096, "step": 19715 }, { "epoch": 10.24948024948025, "grad_norm": 0.38459450006484985, "learning_rate": 2.826765162733922e-05, "loss": 0.249, "num_input_tokens_seen": 3760056, "step": 19720 }, { "epoch": 10.252079002079002, "grad_norm": 0.7564610242843628, "learning_rate": 2.825640937139804e-05, "loss": 0.0881, "num_input_tokens_seen": 3761016, "step": 19725 }, { "epoch": 10.254677754677754, "grad_norm": 0.23730719089508057, "learning_rate": 2.8245166445535853e-05, "loss": 0.05, "num_input_tokens_seen": 3761912, "step": 19730 }, { "epoch": 10.257276507276508, "grad_norm": 1.7627604007720947, "learning_rate": 2.8233922852065598e-05, "loss": 0.1736, "num_input_tokens_seen": 3762872, "step": 19735 }, { "epoch": 10.25987525987526, "grad_norm": 1.3964481353759766, "learning_rate": 2.8222678593300345e-05, "loss": 0.1883, "num_input_tokens_seen": 3763800, "step": 19740 }, { "epoch": 10.262474012474012, "grad_norm": 0.9569007754325867, "learning_rate": 2.8211433671553317e-05, "loss": 0.071, "num_input_tokens_seen": 3764760, "step": 19745 }, { "epoch": 10.265072765072764, "grad_norm": 0.21345633268356323, "learning_rate": 2.8200188089137846e-05, "loss": 0.1372, "num_input_tokens_seen": 3765688, "step": 19750 }, { "epoch": 10.267671517671518, "grad_norm": 0.25913575291633606, "learning_rate": 2.8188941848367423e-05, "loss": 0.1221, "num_input_tokens_seen": 3766648, "step": 19755 }, { "epoch": 10.27027027027027, "grad_norm": 0.7389880418777466, "learning_rate": 2.817769495155567e-05, "loss": 0.1236, "num_input_tokens_seen": 3767512, "step": 19760 }, { "epoch": 10.272869022869022, "grad_norm": 1.411280632019043, "learning_rate": 2.8166447401016345e-05, "loss": 0.1785, "num_input_tokens_seen": 3768536, "step": 19765 }, { "epoch": 10.275467775467776, "grad_norm": 1.2520920038223267, "learning_rate": 2.8155199199063332e-05, "loss": 0.2892, "num_input_tokens_seen": 3769528, "step": 19770 }, { "epoch": 10.278066528066528, "grad_norm": 0.1868458241224289, "learning_rate": 2.8143950348010656e-05, "loss": 0.1789, "num_input_tokens_seen": 3770456, "step": 19775 }, { "epoch": 10.28066528066528, "grad_norm": 0.5273217558860779, "learning_rate": 2.8132700850172477e-05, "loss": 0.2986, "num_input_tokens_seen": 3771384, "step": 19780 }, { "epoch": 10.283264033264032, "grad_norm": 0.11456387490034103, "learning_rate": 2.8121450707863074e-05, "loss": 0.08, "num_input_tokens_seen": 3772344, "step": 19785 }, { "epoch": 10.285862785862786, "grad_norm": 0.5373760461807251, "learning_rate": 2.811019992339688e-05, "loss": 0.1499, "num_input_tokens_seen": 3773272, "step": 19790 }, { "epoch": 10.288461538461538, "grad_norm": 0.5355638265609741, "learning_rate": 2.8098948499088446e-05, "loss": 0.0789, "num_input_tokens_seen": 3774264, "step": 19795 }, { "epoch": 10.29106029106029, "grad_norm": 0.13725467026233673, "learning_rate": 2.808769643725246e-05, "loss": 0.0535, "num_input_tokens_seen": 3775224, "step": 19800 }, { "epoch": 10.293659043659044, "grad_norm": 0.18004019558429718, "learning_rate": 2.8076443740203728e-05, "loss": 0.0434, "num_input_tokens_seen": 3776216, "step": 19805 }, { "epoch": 10.296257796257796, "grad_norm": 0.29292041063308716, "learning_rate": 2.8065190410257213e-05, "loss": 0.0746, "num_input_tokens_seen": 3777144, "step": 19810 }, { "epoch": 10.298856548856548, "grad_norm": 0.3475850224494934, "learning_rate": 2.805393644972799e-05, "loss": 0.207, "num_input_tokens_seen": 3778040, "step": 19815 }, { "epoch": 10.301455301455302, "grad_norm": 0.9223664999008179, "learning_rate": 2.8042681860931257e-05, "loss": 0.2373, "num_input_tokens_seen": 3778936, "step": 19820 }, { "epoch": 10.304054054054054, "grad_norm": 0.8324136734008789, "learning_rate": 2.803142664618237e-05, "loss": 0.1268, "num_input_tokens_seen": 3779800, "step": 19825 }, { "epoch": 10.306652806652806, "grad_norm": 0.12584517896175385, "learning_rate": 2.802017080779677e-05, "loss": 0.0688, "num_input_tokens_seen": 3780824, "step": 19830 }, { "epoch": 10.309251559251559, "grad_norm": 0.17198586463928223, "learning_rate": 2.800891434809007e-05, "loss": 0.0316, "num_input_tokens_seen": 3781784, "step": 19835 }, { "epoch": 10.311850311850312, "grad_norm": 0.3113061487674713, "learning_rate": 2.7997657269377982e-05, "loss": 0.093, "num_input_tokens_seen": 3782744, "step": 19840 }, { "epoch": 10.314449064449065, "grad_norm": 0.9197213649749756, "learning_rate": 2.798639957397637e-05, "loss": 0.0957, "num_input_tokens_seen": 3783704, "step": 19845 }, { "epoch": 10.317047817047817, "grad_norm": 0.19559933245182037, "learning_rate": 2.7975141264201192e-05, "loss": 0.1016, "num_input_tokens_seen": 3784632, "step": 19850 }, { "epoch": 10.31964656964657, "grad_norm": 0.14925934374332428, "learning_rate": 2.7963882342368558e-05, "loss": 0.0828, "num_input_tokens_seen": 3785624, "step": 19855 }, { "epoch": 10.322245322245323, "grad_norm": 1.070915699005127, "learning_rate": 2.79526228107947e-05, "loss": 0.1779, "num_input_tokens_seen": 3786520, "step": 19860 }, { "epoch": 10.324844074844075, "grad_norm": 0.3937858045101166, "learning_rate": 2.794136267179596e-05, "loss": 0.1745, "num_input_tokens_seen": 3787480, "step": 19865 }, { "epoch": 10.327442827442827, "grad_norm": 0.18951968848705292, "learning_rate": 2.793010192768884e-05, "loss": 0.0956, "num_input_tokens_seen": 3788504, "step": 19870 }, { "epoch": 10.33004158004158, "grad_norm": 0.4562492072582245, "learning_rate": 2.7918840580789923e-05, "loss": 0.0754, "num_input_tokens_seen": 3789464, "step": 19875 }, { "epoch": 10.332640332640333, "grad_norm": 0.4613192081451416, "learning_rate": 2.7907578633415944e-05, "loss": 0.0459, "num_input_tokens_seen": 3790424, "step": 19880 }, { "epoch": 10.335239085239085, "grad_norm": 0.9059663414955139, "learning_rate": 2.789631608788375e-05, "loss": 0.1957, "num_input_tokens_seen": 3791384, "step": 19885 }, { "epoch": 10.337837837837839, "grad_norm": 0.14904022216796875, "learning_rate": 2.7885052946510316e-05, "loss": 0.1007, "num_input_tokens_seen": 3792312, "step": 19890 }, { "epoch": 10.34043659043659, "grad_norm": 0.32372161746025085, "learning_rate": 2.7873789211612744e-05, "loss": 0.1499, "num_input_tokens_seen": 3793336, "step": 19895 }, { "epoch": 10.343035343035343, "grad_norm": 1.2410024404525757, "learning_rate": 2.7862524885508245e-05, "loss": 0.0868, "num_input_tokens_seen": 3794264, "step": 19900 }, { "epoch": 10.345634095634095, "grad_norm": 0.7496461272239685, "learning_rate": 2.785125997051417e-05, "loss": 0.0938, "num_input_tokens_seen": 3795192, "step": 19905 }, { "epoch": 10.348232848232849, "grad_norm": 0.37170976400375366, "learning_rate": 2.783999446894797e-05, "loss": 0.1867, "num_input_tokens_seen": 3796216, "step": 19910 }, { "epoch": 10.3508316008316, "grad_norm": 1.088952898979187, "learning_rate": 2.7828728383127232e-05, "loss": 0.1806, "num_input_tokens_seen": 3797208, "step": 19915 }, { "epoch": 10.353430353430353, "grad_norm": 0.5757919549942017, "learning_rate": 2.7817461715369654e-05, "loss": 0.0969, "num_input_tokens_seen": 3798104, "step": 19920 }, { "epoch": 10.356029106029107, "grad_norm": 0.25406503677368164, "learning_rate": 2.7806194467993064e-05, "loss": 0.0429, "num_input_tokens_seen": 3799032, "step": 19925 }, { "epoch": 10.358627858627859, "grad_norm": 0.25329676270484924, "learning_rate": 2.7794926643315396e-05, "loss": 0.1405, "num_input_tokens_seen": 3799896, "step": 19930 }, { "epoch": 10.361226611226611, "grad_norm": 0.20986081659793854, "learning_rate": 2.7783658243654714e-05, "loss": 0.1387, "num_input_tokens_seen": 3800856, "step": 19935 }, { "epoch": 10.363825363825363, "grad_norm": 0.5864109992980957, "learning_rate": 2.777238927132921e-05, "loss": 0.1092, "num_input_tokens_seen": 3801784, "step": 19940 }, { "epoch": 10.366424116424117, "grad_norm": 0.3344852328300476, "learning_rate": 2.7761119728657148e-05, "loss": 0.1495, "num_input_tokens_seen": 3802712, "step": 19945 }, { "epoch": 10.369022869022869, "grad_norm": 0.4297042787075043, "learning_rate": 2.7749849617956973e-05, "loss": 0.1025, "num_input_tokens_seen": 3803704, "step": 19950 }, { "epoch": 10.371621621621621, "grad_norm": 0.5909316539764404, "learning_rate": 2.7738578941547193e-05, "loss": 0.0546, "num_input_tokens_seen": 3804664, "step": 19955 }, { "epoch": 10.374220374220375, "grad_norm": 0.6598770022392273, "learning_rate": 2.7727307701746475e-05, "loss": 0.1166, "num_input_tokens_seen": 3805624, "step": 19960 }, { "epoch": 10.376819126819127, "grad_norm": 0.4711473882198334, "learning_rate": 2.7716035900873566e-05, "loss": 0.0954, "num_input_tokens_seen": 3806584, "step": 19965 }, { "epoch": 10.379417879417879, "grad_norm": 0.7850996255874634, "learning_rate": 2.7704763541247353e-05, "loss": 0.2055, "num_input_tokens_seen": 3807544, "step": 19970 }, { "epoch": 10.382016632016631, "grad_norm": 0.7125721573829651, "learning_rate": 2.7693490625186824e-05, "loss": 0.2057, "num_input_tokens_seen": 3808568, "step": 19975 }, { "epoch": 10.384615384615385, "grad_norm": 0.32766613364219666, "learning_rate": 2.7682217155011082e-05, "loss": 0.0684, "num_input_tokens_seen": 3809496, "step": 19980 }, { "epoch": 10.387214137214137, "grad_norm": 0.6606484055519104, "learning_rate": 2.7670943133039356e-05, "loss": 0.2263, "num_input_tokens_seen": 3810552, "step": 19985 }, { "epoch": 10.38981288981289, "grad_norm": 1.1779825687408447, "learning_rate": 2.765966856159098e-05, "loss": 0.1919, "num_input_tokens_seen": 3811480, "step": 19990 }, { "epoch": 10.392411642411643, "grad_norm": 0.1823272407054901, "learning_rate": 2.7648393442985403e-05, "loss": 0.117, "num_input_tokens_seen": 3812472, "step": 19995 }, { "epoch": 10.395010395010395, "grad_norm": 0.18828800320625305, "learning_rate": 2.7637117779542182e-05, "loss": 0.0711, "num_input_tokens_seen": 3813464, "step": 20000 }, { "epoch": 10.397609147609147, "grad_norm": 0.4050294756889343, "learning_rate": 2.762584157358099e-05, "loss": 0.0382, "num_input_tokens_seen": 3814328, "step": 20005 }, { "epoch": 10.4002079002079, "grad_norm": 0.7846800684928894, "learning_rate": 2.7614564827421617e-05, "loss": 0.2022, "num_input_tokens_seen": 3815320, "step": 20010 }, { "epoch": 10.402806652806653, "grad_norm": 0.10732147842645645, "learning_rate": 2.760328754338395e-05, "loss": 0.1006, "num_input_tokens_seen": 3816248, "step": 20015 }, { "epoch": 10.405405405405405, "grad_norm": 0.08946334570646286, "learning_rate": 2.7592009723788004e-05, "loss": 0.1418, "num_input_tokens_seen": 3817240, "step": 20020 }, { "epoch": 10.408004158004157, "grad_norm": 0.3895941376686096, "learning_rate": 2.758073137095388e-05, "loss": 0.1403, "num_input_tokens_seen": 3818168, "step": 20025 }, { "epoch": 10.410602910602911, "grad_norm": 0.28993767499923706, "learning_rate": 2.7569452487201824e-05, "loss": 0.1083, "num_input_tokens_seen": 3819128, "step": 20030 }, { "epoch": 10.413201663201663, "grad_norm": 1.0659747123718262, "learning_rate": 2.7558173074852155e-05, "loss": 0.1322, "num_input_tokens_seen": 3820024, "step": 20035 }, { "epoch": 10.415800415800415, "grad_norm": 0.3246299922466278, "learning_rate": 2.7546893136225334e-05, "loss": 0.0613, "num_input_tokens_seen": 3820952, "step": 20040 }, { "epoch": 10.41839916839917, "grad_norm": 0.52723228931427, "learning_rate": 2.7535612673641892e-05, "loss": 0.1434, "num_input_tokens_seen": 3821944, "step": 20045 }, { "epoch": 10.420997920997921, "grad_norm": 0.7823325991630554, "learning_rate": 2.7524331689422506e-05, "loss": 0.2288, "num_input_tokens_seen": 3822808, "step": 20050 }, { "epoch": 10.423596673596673, "grad_norm": 0.09202374517917633, "learning_rate": 2.751305018588793e-05, "loss": 0.0816, "num_input_tokens_seen": 3823832, "step": 20055 }, { "epoch": 10.426195426195425, "grad_norm": 0.20813338458538055, "learning_rate": 2.7501768165359043e-05, "loss": 0.1633, "num_input_tokens_seen": 3824824, "step": 20060 }, { "epoch": 10.42879417879418, "grad_norm": 0.59262615442276, "learning_rate": 2.749048563015684e-05, "loss": 0.1734, "num_input_tokens_seen": 3825816, "step": 20065 }, { "epoch": 10.431392931392931, "grad_norm": 0.12945668399333954, "learning_rate": 2.747920258260239e-05, "loss": 0.1445, "num_input_tokens_seen": 3826776, "step": 20070 }, { "epoch": 10.433991683991684, "grad_norm": 1.3650134801864624, "learning_rate": 2.7467919025016885e-05, "loss": 0.1467, "num_input_tokens_seen": 3827768, "step": 20075 }, { "epoch": 10.436590436590437, "grad_norm": 0.19241714477539062, "learning_rate": 2.745663495972162e-05, "loss": 0.1324, "num_input_tokens_seen": 3828728, "step": 20080 }, { "epoch": 10.43918918918919, "grad_norm": 0.9561806917190552, "learning_rate": 2.7445350389038018e-05, "loss": 0.1016, "num_input_tokens_seen": 3829752, "step": 20085 }, { "epoch": 10.441787941787942, "grad_norm": 0.12089741975069046, "learning_rate": 2.7434065315287553e-05, "loss": 0.0691, "num_input_tokens_seen": 3830776, "step": 20090 }, { "epoch": 10.444386694386694, "grad_norm": 0.38380166888237, "learning_rate": 2.7422779740791858e-05, "loss": 0.198, "num_input_tokens_seen": 3831800, "step": 20095 }, { "epoch": 10.446985446985448, "grad_norm": 0.9761490225791931, "learning_rate": 2.7411493667872635e-05, "loss": 0.1474, "num_input_tokens_seen": 3832792, "step": 20100 }, { "epoch": 10.4495841995842, "grad_norm": 0.06811904907226562, "learning_rate": 2.7400207098851698e-05, "loss": 0.0441, "num_input_tokens_seen": 3833784, "step": 20105 }, { "epoch": 10.452182952182952, "grad_norm": 0.4328056275844574, "learning_rate": 2.7388920036050965e-05, "loss": 0.1038, "num_input_tokens_seen": 3834744, "step": 20110 }, { "epoch": 10.454781704781706, "grad_norm": 0.4906936585903168, "learning_rate": 2.7377632481792447e-05, "loss": 0.0637, "num_input_tokens_seen": 3835672, "step": 20115 }, { "epoch": 10.457380457380458, "grad_norm": 0.1220477819442749, "learning_rate": 2.736634443839828e-05, "loss": 0.1529, "num_input_tokens_seen": 3836568, "step": 20120 }, { "epoch": 10.45997920997921, "grad_norm": 0.1689569056034088, "learning_rate": 2.735505590819066e-05, "loss": 0.0392, "num_input_tokens_seen": 3837464, "step": 20125 }, { "epoch": 10.462577962577962, "grad_norm": 0.750484049320221, "learning_rate": 2.7343766893491923e-05, "loss": 0.2208, "num_input_tokens_seen": 3838456, "step": 20130 }, { "epoch": 10.465176715176716, "grad_norm": 0.1641809344291687, "learning_rate": 2.733247739662448e-05, "loss": 0.0825, "num_input_tokens_seen": 3839384, "step": 20135 }, { "epoch": 10.467775467775468, "grad_norm": 0.18483708798885345, "learning_rate": 2.732118741991086e-05, "loss": 0.072, "num_input_tokens_seen": 3840408, "step": 20140 }, { "epoch": 10.47037422037422, "grad_norm": 0.9745309352874756, "learning_rate": 2.7309896965673677e-05, "loss": 0.0914, "num_input_tokens_seen": 3841304, "step": 20145 }, { "epoch": 10.472972972972974, "grad_norm": 1.4208178520202637, "learning_rate": 2.7298606036235635e-05, "loss": 0.1419, "num_input_tokens_seen": 3842232, "step": 20150 }, { "epoch": 10.475571725571726, "grad_norm": 0.12409214675426483, "learning_rate": 2.728731463391956e-05, "loss": 0.1255, "num_input_tokens_seen": 3843256, "step": 20155 }, { "epoch": 10.478170478170478, "grad_norm": 0.8163677453994751, "learning_rate": 2.7276022761048354e-05, "loss": 0.1314, "num_input_tokens_seen": 3844152, "step": 20160 }, { "epoch": 10.48076923076923, "grad_norm": 1.1587826013565063, "learning_rate": 2.7264730419945032e-05, "loss": 0.2851, "num_input_tokens_seen": 3845144, "step": 20165 }, { "epoch": 10.483367983367984, "grad_norm": 0.10403577983379364, "learning_rate": 2.72534376129327e-05, "loss": 0.1194, "num_input_tokens_seen": 3846072, "step": 20170 }, { "epoch": 10.485966735966736, "grad_norm": 0.7497982978820801, "learning_rate": 2.7242144342334547e-05, "loss": 0.0683, "num_input_tokens_seen": 3847032, "step": 20175 }, { "epoch": 10.488565488565488, "grad_norm": 1.0352516174316406, "learning_rate": 2.7230850610473867e-05, "loss": 0.1749, "num_input_tokens_seen": 3848088, "step": 20180 }, { "epoch": 10.491164241164242, "grad_norm": 0.8904209136962891, "learning_rate": 2.7219556419674057e-05, "loss": 0.1811, "num_input_tokens_seen": 3848984, "step": 20185 }, { "epoch": 10.493762993762994, "grad_norm": 0.678467869758606, "learning_rate": 2.72082617722586e-05, "loss": 0.0678, "num_input_tokens_seen": 3849912, "step": 20190 }, { "epoch": 10.496361746361746, "grad_norm": 0.2121157944202423, "learning_rate": 2.7196966670551076e-05, "loss": 0.0745, "num_input_tokens_seen": 3850872, "step": 20195 }, { "epoch": 10.4989604989605, "grad_norm": 0.946103036403656, "learning_rate": 2.7185671116875155e-05, "loss": 0.1556, "num_input_tokens_seen": 3851800, "step": 20200 }, { "epoch": 10.501559251559252, "grad_norm": 0.8612972497940063, "learning_rate": 2.7174375113554606e-05, "loss": 0.1758, "num_input_tokens_seen": 3852760, "step": 20205 }, { "epoch": 10.504158004158004, "grad_norm": 0.8727184534072876, "learning_rate": 2.7163078662913278e-05, "loss": 0.1196, "num_input_tokens_seen": 3853656, "step": 20210 }, { "epoch": 10.506756756756756, "grad_norm": 0.7319507598876953, "learning_rate": 2.7151781767275113e-05, "loss": 0.1857, "num_input_tokens_seen": 3854584, "step": 20215 }, { "epoch": 10.50935550935551, "grad_norm": 0.12029659003019333, "learning_rate": 2.7140484428964168e-05, "loss": 0.2089, "num_input_tokens_seen": 3855512, "step": 20220 }, { "epoch": 10.511954261954262, "grad_norm": 0.8636860847473145, "learning_rate": 2.7129186650304568e-05, "loss": 0.2994, "num_input_tokens_seen": 3856440, "step": 20225 }, { "epoch": 10.514553014553014, "grad_norm": 0.995278537273407, "learning_rate": 2.7117888433620537e-05, "loss": 0.2176, "num_input_tokens_seen": 3857336, "step": 20230 }, { "epoch": 10.517151767151766, "grad_norm": 0.5500096082687378, "learning_rate": 2.7106589781236386e-05, "loss": 0.1562, "num_input_tokens_seen": 3858328, "step": 20235 }, { "epoch": 10.51975051975052, "grad_norm": 0.5617042183876038, "learning_rate": 2.7095290695476517e-05, "loss": 0.0842, "num_input_tokens_seen": 3859224, "step": 20240 }, { "epoch": 10.522349272349272, "grad_norm": 1.0715841054916382, "learning_rate": 2.7083991178665417e-05, "loss": 0.2166, "num_input_tokens_seen": 3860216, "step": 20245 }, { "epoch": 10.524948024948024, "grad_norm": 0.11520032584667206, "learning_rate": 2.7072691233127673e-05, "loss": 0.0548, "num_input_tokens_seen": 3861176, "step": 20250 }, { "epoch": 10.527546777546778, "grad_norm": 0.3084385097026825, "learning_rate": 2.7061390861187953e-05, "loss": 0.1, "num_input_tokens_seen": 3862104, "step": 20255 }, { "epoch": 10.53014553014553, "grad_norm": 0.752007007598877, "learning_rate": 2.7050090065171003e-05, "loss": 0.1975, "num_input_tokens_seen": 3863032, "step": 20260 }, { "epoch": 10.532744282744282, "grad_norm": 1.5185092687606812, "learning_rate": 2.703878884740168e-05, "loss": 0.3022, "num_input_tokens_seen": 3863992, "step": 20265 }, { "epoch": 10.535343035343036, "grad_norm": 0.7433788776397705, "learning_rate": 2.7027487210204903e-05, "loss": 0.078, "num_input_tokens_seen": 3864920, "step": 20270 }, { "epoch": 10.537941787941788, "grad_norm": 0.5739290118217468, "learning_rate": 2.7016185155905694e-05, "loss": 0.161, "num_input_tokens_seen": 3865848, "step": 20275 }, { "epoch": 10.54054054054054, "grad_norm": 0.9724349975585938, "learning_rate": 2.7004882686829154e-05, "loss": 0.0878, "num_input_tokens_seen": 3866808, "step": 20280 }, { "epoch": 10.543139293139292, "grad_norm": 0.7251267433166504, "learning_rate": 2.6993579805300462e-05, "loss": 0.2189, "num_input_tokens_seen": 3867832, "step": 20285 }, { "epoch": 10.545738045738046, "grad_norm": 1.623906135559082, "learning_rate": 2.6982276513644904e-05, "loss": 0.1181, "num_input_tokens_seen": 3868760, "step": 20290 }, { "epoch": 10.548336798336798, "grad_norm": 0.7311843633651733, "learning_rate": 2.697097281418783e-05, "loss": 0.2044, "num_input_tokens_seen": 3869720, "step": 20295 }, { "epoch": 10.55093555093555, "grad_norm": 0.11489278823137283, "learning_rate": 2.695966870925468e-05, "loss": 0.1656, "num_input_tokens_seen": 3870680, "step": 20300 }, { "epoch": 10.553534303534304, "grad_norm": 1.0037188529968262, "learning_rate": 2.694836420117097e-05, "loss": 0.2056, "num_input_tokens_seen": 3871576, "step": 20305 }, { "epoch": 10.556133056133056, "grad_norm": 0.5150179862976074, "learning_rate": 2.6937059292262316e-05, "loss": 0.0507, "num_input_tokens_seen": 3872472, "step": 20310 }, { "epoch": 10.558731808731808, "grad_norm": 1.1253172159194946, "learning_rate": 2.6925753984854417e-05, "loss": 0.2346, "num_input_tokens_seen": 3873400, "step": 20315 }, { "epoch": 10.56133056133056, "grad_norm": 0.1983412802219391, "learning_rate": 2.691444828127302e-05, "loss": 0.095, "num_input_tokens_seen": 3874296, "step": 20320 }, { "epoch": 10.563929313929314, "grad_norm": 0.6812463998794556, "learning_rate": 2.6903142183844e-05, "loss": 0.144, "num_input_tokens_seen": 3875352, "step": 20325 }, { "epoch": 10.566528066528067, "grad_norm": 0.22260156273841858, "learning_rate": 2.6891835694893268e-05, "loss": 0.0967, "num_input_tokens_seen": 3876280, "step": 20330 }, { "epoch": 10.569126819126819, "grad_norm": 0.14894118905067444, "learning_rate": 2.688052881674687e-05, "loss": 0.0843, "num_input_tokens_seen": 3877336, "step": 20335 }, { "epoch": 10.571725571725572, "grad_norm": 0.6889248490333557, "learning_rate": 2.6869221551730866e-05, "loss": 0.1458, "num_input_tokens_seen": 3878200, "step": 20340 }, { "epoch": 10.574324324324325, "grad_norm": 0.6577925682067871, "learning_rate": 2.685791390217145e-05, "loss": 0.0874, "num_input_tokens_seen": 3879160, "step": 20345 }, { "epoch": 10.576923076923077, "grad_norm": 1.761611819267273, "learning_rate": 2.684660587039487e-05, "loss": 0.1187, "num_input_tokens_seen": 3880152, "step": 20350 }, { "epoch": 10.579521829521829, "grad_norm": 0.15433849394321442, "learning_rate": 2.683529745872746e-05, "loss": 0.1155, "num_input_tokens_seen": 3881112, "step": 20355 }, { "epoch": 10.582120582120583, "grad_norm": 0.8368565440177917, "learning_rate": 2.682398866949562e-05, "loss": 0.0834, "num_input_tokens_seen": 3882008, "step": 20360 }, { "epoch": 10.584719334719335, "grad_norm": 0.5727123022079468, "learning_rate": 2.6812679505025855e-05, "loss": 0.1161, "num_input_tokens_seen": 3882936, "step": 20365 }, { "epoch": 10.587318087318087, "grad_norm": 0.9133585691452026, "learning_rate": 2.680136996764472e-05, "loss": 0.1498, "num_input_tokens_seen": 3883896, "step": 20370 }, { "epoch": 10.58991683991684, "grad_norm": 0.7458679676055908, "learning_rate": 2.6790060059678845e-05, "loss": 0.2026, "num_input_tokens_seen": 3884888, "step": 20375 }, { "epoch": 10.592515592515593, "grad_norm": 0.09623579680919647, "learning_rate": 2.677874978345497e-05, "loss": 0.0851, "num_input_tokens_seen": 3885784, "step": 20380 }, { "epoch": 10.595114345114345, "grad_norm": 0.2285655438899994, "learning_rate": 2.6767439141299865e-05, "loss": 0.09, "num_input_tokens_seen": 3886808, "step": 20385 }, { "epoch": 10.597713097713097, "grad_norm": 1.393925428390503, "learning_rate": 2.675612813554042e-05, "loss": 0.1494, "num_input_tokens_seen": 3887768, "step": 20390 }, { "epoch": 10.60031185031185, "grad_norm": 0.7513118982315063, "learning_rate": 2.6744816768503566e-05, "loss": 0.204, "num_input_tokens_seen": 3888728, "step": 20395 }, { "epoch": 10.602910602910603, "grad_norm": 0.7094990015029907, "learning_rate": 2.6733505042516325e-05, "loss": 0.1633, "num_input_tokens_seen": 3889784, "step": 20400 }, { "epoch": 10.605509355509355, "grad_norm": 0.4655317962169647, "learning_rate": 2.672219295990579e-05, "loss": 0.1474, "num_input_tokens_seen": 3890776, "step": 20405 }, { "epoch": 10.608108108108109, "grad_norm": 0.13150133192539215, "learning_rate": 2.6710880522999122e-05, "loss": 0.2478, "num_input_tokens_seen": 3891736, "step": 20410 }, { "epoch": 10.61070686070686, "grad_norm": 0.1314009130001068, "learning_rate": 2.669956773412356e-05, "loss": 0.1518, "num_input_tokens_seen": 3892696, "step": 20415 }, { "epoch": 10.613305613305613, "grad_norm": 0.5412645936012268, "learning_rate": 2.668825459560641e-05, "loss": 0.0504, "num_input_tokens_seen": 3893560, "step": 20420 }, { "epoch": 10.615904365904367, "grad_norm": 0.3938758969306946, "learning_rate": 2.667694110977506e-05, "loss": 0.1476, "num_input_tokens_seen": 3894456, "step": 20425 }, { "epoch": 10.618503118503119, "grad_norm": 0.24947963654994965, "learning_rate": 2.6665627278956967e-05, "loss": 0.1453, "num_input_tokens_seen": 3895448, "step": 20430 }, { "epoch": 10.621101871101871, "grad_norm": 1.248876929283142, "learning_rate": 2.6654313105479646e-05, "loss": 0.1074, "num_input_tokens_seen": 3896344, "step": 20435 }, { "epoch": 10.623700623700623, "grad_norm": 0.716087818145752, "learning_rate": 2.6642998591670698e-05, "loss": 0.1444, "num_input_tokens_seen": 3897240, "step": 20440 }, { "epoch": 10.626299376299377, "grad_norm": 0.5398210883140564, "learning_rate": 2.663168373985779e-05, "loss": 0.1278, "num_input_tokens_seen": 3898168, "step": 20445 }, { "epoch": 10.628898128898129, "grad_norm": 0.845323920249939, "learning_rate": 2.6620368552368653e-05, "loss": 0.1706, "num_input_tokens_seen": 3899128, "step": 20450 }, { "epoch": 10.631496881496881, "grad_norm": 1.619626760482788, "learning_rate": 2.6609053031531083e-05, "loss": 0.1896, "num_input_tokens_seen": 3900056, "step": 20455 }, { "epoch": 10.634095634095633, "grad_norm": 0.7082865238189697, "learning_rate": 2.659773717967297e-05, "loss": 0.1564, "num_input_tokens_seen": 3901048, "step": 20460 }, { "epoch": 10.636694386694387, "grad_norm": 0.3215455710887909, "learning_rate": 2.6586420999122248e-05, "loss": 0.0703, "num_input_tokens_seen": 3901912, "step": 20465 }, { "epoch": 10.63929313929314, "grad_norm": 0.1219649687409401, "learning_rate": 2.657510449220692e-05, "loss": 0.0672, "num_input_tokens_seen": 3902840, "step": 20470 }, { "epoch": 10.641891891891891, "grad_norm": 0.19189788401126862, "learning_rate": 2.6563787661255057e-05, "loss": 0.0348, "num_input_tokens_seen": 3903736, "step": 20475 }, { "epoch": 10.644490644490645, "grad_norm": 1.3358871936798096, "learning_rate": 2.6552470508594812e-05, "loss": 0.1374, "num_input_tokens_seen": 3904696, "step": 20480 }, { "epoch": 10.647089397089397, "grad_norm": 0.6913952827453613, "learning_rate": 2.6541153036554385e-05, "loss": 0.0507, "num_input_tokens_seen": 3905624, "step": 20485 }, { "epoch": 10.64968814968815, "grad_norm": 0.1002219021320343, "learning_rate": 2.6529835247462048e-05, "loss": 0.1445, "num_input_tokens_seen": 3906616, "step": 20490 }, { "epoch": 10.652286902286903, "grad_norm": 1.0161060094833374, "learning_rate": 2.651851714364616e-05, "loss": 0.2213, "num_input_tokens_seen": 3907672, "step": 20495 }, { "epoch": 10.654885654885655, "grad_norm": 0.3763980567455292, "learning_rate": 2.6507198727435094e-05, "loss": 0.0854, "num_input_tokens_seen": 3908632, "step": 20500 }, { "epoch": 10.657484407484407, "grad_norm": 0.5043198466300964, "learning_rate": 2.6495880001157342e-05, "loss": 0.1033, "num_input_tokens_seen": 3909688, "step": 20505 }, { "epoch": 10.66008316008316, "grad_norm": 0.26444169878959656, "learning_rate": 2.648456096714142e-05, "loss": 0.09, "num_input_tokens_seen": 3910712, "step": 20510 }, { "epoch": 10.662681912681913, "grad_norm": 0.7333424687385559, "learning_rate": 2.647324162771594e-05, "loss": 0.1076, "num_input_tokens_seen": 3911608, "step": 20515 }, { "epoch": 10.665280665280665, "grad_norm": 1.056117057800293, "learning_rate": 2.6461921985209536e-05, "loss": 0.1757, "num_input_tokens_seen": 3912568, "step": 20520 }, { "epoch": 10.667879417879417, "grad_norm": 0.8784705400466919, "learning_rate": 2.6450602041950956e-05, "loss": 0.1588, "num_input_tokens_seen": 3913496, "step": 20525 }, { "epoch": 10.670478170478171, "grad_norm": 0.8492365479469299, "learning_rate": 2.6439281800268967e-05, "loss": 0.1676, "num_input_tokens_seen": 3914456, "step": 20530 }, { "epoch": 10.673076923076923, "grad_norm": 1.2842514514923096, "learning_rate": 2.6427961262492402e-05, "loss": 0.2404, "num_input_tokens_seen": 3915384, "step": 20535 }, { "epoch": 10.675675675675675, "grad_norm": 0.7085651755332947, "learning_rate": 2.641664043095019e-05, "loss": 0.1411, "num_input_tokens_seen": 3916344, "step": 20540 }, { "epoch": 10.678274428274428, "grad_norm": 0.27550357580184937, "learning_rate": 2.640531930797127e-05, "loss": 0.0994, "num_input_tokens_seen": 3917240, "step": 20545 }, { "epoch": 10.680873180873181, "grad_norm": 0.08435481786727905, "learning_rate": 2.6393997895884692e-05, "loss": 0.0754, "num_input_tokens_seen": 3918200, "step": 20550 }, { "epoch": 10.683471933471933, "grad_norm": 0.8918339014053345, "learning_rate": 2.638267619701952e-05, "loss": 0.0801, "num_input_tokens_seen": 3919096, "step": 20555 }, { "epoch": 10.686070686070686, "grad_norm": 0.6591753959655762, "learning_rate": 2.637135421370491e-05, "loss": 0.1847, "num_input_tokens_seen": 3920120, "step": 20560 }, { "epoch": 10.68866943866944, "grad_norm": 0.4112691283226013, "learning_rate": 2.6360031948270057e-05, "loss": 0.09, "num_input_tokens_seen": 3921112, "step": 20565 }, { "epoch": 10.691268191268192, "grad_norm": 1.1440961360931396, "learning_rate": 2.6348709403044218e-05, "loss": 0.1123, "num_input_tokens_seen": 3922008, "step": 20570 }, { "epoch": 10.693866943866944, "grad_norm": 0.12933450937271118, "learning_rate": 2.633738658035672e-05, "loss": 0.0713, "num_input_tokens_seen": 3922936, "step": 20575 }, { "epoch": 10.696465696465696, "grad_norm": 1.4913681745529175, "learning_rate": 2.6326063482536934e-05, "loss": 0.1975, "num_input_tokens_seen": 3923896, "step": 20580 }, { "epoch": 10.69906444906445, "grad_norm": 0.6785091161727905, "learning_rate": 2.631474011191429e-05, "loss": 0.137, "num_input_tokens_seen": 3924824, "step": 20585 }, { "epoch": 10.701663201663202, "grad_norm": 0.5846471786499023, "learning_rate": 2.6303416470818266e-05, "loss": 0.105, "num_input_tokens_seen": 3925816, "step": 20590 }, { "epoch": 10.704261954261954, "grad_norm": 0.8189961314201355, "learning_rate": 2.629209256157843e-05, "loss": 0.247, "num_input_tokens_seen": 3926776, "step": 20595 }, { "epoch": 10.706860706860708, "grad_norm": 0.6908302307128906, "learning_rate": 2.6280768386524346e-05, "loss": 0.1543, "num_input_tokens_seen": 3927768, "step": 20600 }, { "epoch": 10.70945945945946, "grad_norm": 1.0302790403366089, "learning_rate": 2.6269443947985695e-05, "loss": 0.1231, "num_input_tokens_seen": 3928760, "step": 20605 }, { "epoch": 10.712058212058212, "grad_norm": 0.4421966075897217, "learning_rate": 2.6258119248292168e-05, "loss": 0.0763, "num_input_tokens_seen": 3929720, "step": 20610 }, { "epoch": 10.714656964656964, "grad_norm": 0.8804928064346313, "learning_rate": 2.624679428977353e-05, "loss": 0.1787, "num_input_tokens_seen": 3930584, "step": 20615 }, { "epoch": 10.717255717255718, "grad_norm": 0.15048235654830933, "learning_rate": 2.62354690747596e-05, "loss": 0.0295, "num_input_tokens_seen": 3931480, "step": 20620 }, { "epoch": 10.71985446985447, "grad_norm": 1.2674964666366577, "learning_rate": 2.6224143605580232e-05, "loss": 0.1616, "num_input_tokens_seen": 3932344, "step": 20625 }, { "epoch": 10.722453222453222, "grad_norm": 0.7669163942337036, "learning_rate": 2.621281788456537e-05, "loss": 0.1526, "num_input_tokens_seen": 3933272, "step": 20630 }, { "epoch": 10.725051975051976, "grad_norm": 0.44325703382492065, "learning_rate": 2.620149191404495e-05, "loss": 0.1098, "num_input_tokens_seen": 3934136, "step": 20635 }, { "epoch": 10.727650727650728, "grad_norm": 0.8920312523841858, "learning_rate": 2.619016569634902e-05, "loss": 0.1559, "num_input_tokens_seen": 3935128, "step": 20640 }, { "epoch": 10.73024948024948, "grad_norm": 0.29682624340057373, "learning_rate": 2.6178839233807644e-05, "loss": 0.0658, "num_input_tokens_seen": 3936120, "step": 20645 }, { "epoch": 10.732848232848234, "grad_norm": 0.2901751399040222, "learning_rate": 2.616751252875095e-05, "loss": 0.1699, "num_input_tokens_seen": 3937112, "step": 20650 }, { "epoch": 10.735446985446986, "grad_norm": 0.09549769759178162, "learning_rate": 2.6156185583509096e-05, "loss": 0.1406, "num_input_tokens_seen": 3938040, "step": 20655 }, { "epoch": 10.738045738045738, "grad_norm": 0.5145442485809326, "learning_rate": 2.6144858400412325e-05, "loss": 0.0611, "num_input_tokens_seen": 3939000, "step": 20660 }, { "epoch": 10.74064449064449, "grad_norm": 0.595874547958374, "learning_rate": 2.61335309817909e-05, "loss": 0.1348, "num_input_tokens_seen": 3939960, "step": 20665 }, { "epoch": 10.743243243243244, "grad_norm": 0.06859639286994934, "learning_rate": 2.6122203329975132e-05, "loss": 0.1954, "num_input_tokens_seen": 3940888, "step": 20670 }, { "epoch": 10.745841995841996, "grad_norm": 0.27173760533332825, "learning_rate": 2.61108754472954e-05, "loss": 0.1463, "num_input_tokens_seen": 3941848, "step": 20675 }, { "epoch": 10.748440748440748, "grad_norm": 1.3569124937057495, "learning_rate": 2.6099547336082113e-05, "loss": 0.1507, "num_input_tokens_seen": 3942776, "step": 20680 }, { "epoch": 10.7510395010395, "grad_norm": 1.237243890762329, "learning_rate": 2.6088218998665748e-05, "loss": 0.1384, "num_input_tokens_seen": 3943704, "step": 20685 }, { "epoch": 10.753638253638254, "grad_norm": 0.14359751343727112, "learning_rate": 2.607689043737679e-05, "loss": 0.0796, "num_input_tokens_seen": 3944664, "step": 20690 }, { "epoch": 10.756237006237006, "grad_norm": 0.8716090321540833, "learning_rate": 2.6065561654545818e-05, "loss": 0.19, "num_input_tokens_seen": 3945624, "step": 20695 }, { "epoch": 10.758835758835758, "grad_norm": 0.4256066083908081, "learning_rate": 2.6054232652503414e-05, "loss": 0.0639, "num_input_tokens_seen": 3946520, "step": 20700 }, { "epoch": 10.761434511434512, "grad_norm": 0.08037793636322021, "learning_rate": 2.604290343358023e-05, "loss": 0.0689, "num_input_tokens_seen": 3947512, "step": 20705 }, { "epoch": 10.764033264033264, "grad_norm": 1.2645951509475708, "learning_rate": 2.6031574000106962e-05, "loss": 0.1341, "num_input_tokens_seen": 3948408, "step": 20710 }, { "epoch": 10.766632016632016, "grad_norm": 1.7611243724822998, "learning_rate": 2.6020244354414336e-05, "loss": 0.2517, "num_input_tokens_seen": 3949432, "step": 20715 }, { "epoch": 10.76923076923077, "grad_norm": 1.045101523399353, "learning_rate": 2.6008914498833138e-05, "loss": 0.1481, "num_input_tokens_seen": 3950360, "step": 20720 }, { "epoch": 10.771829521829522, "grad_norm": 0.7806352376937866, "learning_rate": 2.5997584435694184e-05, "loss": 0.1519, "num_input_tokens_seen": 3951352, "step": 20725 }, { "epoch": 10.774428274428274, "grad_norm": 0.7094348669052124, "learning_rate": 2.5986254167328343e-05, "loss": 0.0622, "num_input_tokens_seen": 3952312, "step": 20730 }, { "epoch": 10.777027027027026, "grad_norm": 0.3970491588115692, "learning_rate": 2.597492369606651e-05, "loss": 0.125, "num_input_tokens_seen": 3953272, "step": 20735 }, { "epoch": 10.77962577962578, "grad_norm": 0.39596161246299744, "learning_rate": 2.596359302423964e-05, "loss": 0.0715, "num_input_tokens_seen": 3954264, "step": 20740 }, { "epoch": 10.782224532224532, "grad_norm": 0.3508579432964325, "learning_rate": 2.5952262154178724e-05, "loss": 0.1749, "num_input_tokens_seen": 3955256, "step": 20745 }, { "epoch": 10.784823284823284, "grad_norm": 0.3480263650417328, "learning_rate": 2.5940931088214794e-05, "loss": 0.0465, "num_input_tokens_seen": 3956248, "step": 20750 }, { "epoch": 10.787422037422038, "grad_norm": 0.7332822680473328, "learning_rate": 2.5929599828678913e-05, "loss": 0.1879, "num_input_tokens_seen": 3957144, "step": 20755 }, { "epoch": 10.79002079002079, "grad_norm": 0.3645550608634949, "learning_rate": 2.59182683779022e-05, "loss": 0.1699, "num_input_tokens_seen": 3958104, "step": 20760 }, { "epoch": 10.792619542619542, "grad_norm": 0.588960587978363, "learning_rate": 2.59069367382158e-05, "loss": 0.0706, "num_input_tokens_seen": 3959064, "step": 20765 }, { "epoch": 10.795218295218294, "grad_norm": 1.6503219604492188, "learning_rate": 2.5895604911950895e-05, "loss": 0.264, "num_input_tokens_seen": 3960024, "step": 20770 }, { "epoch": 10.797817047817048, "grad_norm": 1.0254883766174316, "learning_rate": 2.5884272901438723e-05, "loss": 0.1001, "num_input_tokens_seen": 3960888, "step": 20775 }, { "epoch": 10.8004158004158, "grad_norm": 0.30540239810943604, "learning_rate": 2.5872940709010535e-05, "loss": 0.0959, "num_input_tokens_seen": 3961848, "step": 20780 }, { "epoch": 10.803014553014552, "grad_norm": 2.1038951873779297, "learning_rate": 2.5861608336997643e-05, "loss": 0.2166, "num_input_tokens_seen": 3962744, "step": 20785 }, { "epoch": 10.805613305613306, "grad_norm": 0.08098141849040985, "learning_rate": 2.5850275787731392e-05, "loss": 0.0993, "num_input_tokens_seen": 3963736, "step": 20790 }, { "epoch": 10.808212058212058, "grad_norm": 0.1596328765153885, "learning_rate": 2.5838943063543136e-05, "loss": 0.0825, "num_input_tokens_seen": 3964696, "step": 20795 }, { "epoch": 10.81081081081081, "grad_norm": 0.18010304868221283, "learning_rate": 2.582761016676431e-05, "loss": 0.1098, "num_input_tokens_seen": 3965624, "step": 20800 }, { "epoch": 10.813409563409563, "grad_norm": 1.1462217569351196, "learning_rate": 2.5816277099726348e-05, "loss": 0.2323, "num_input_tokens_seen": 3966616, "step": 20805 }, { "epoch": 10.816008316008316, "grad_norm": 1.4359983205795288, "learning_rate": 2.580494386476074e-05, "loss": 0.1538, "num_input_tokens_seen": 3967640, "step": 20810 }, { "epoch": 10.818607068607069, "grad_norm": 1.0058859586715698, "learning_rate": 2.5793610464198986e-05, "loss": 0.1289, "num_input_tokens_seen": 3968504, "step": 20815 }, { "epoch": 10.82120582120582, "grad_norm": 0.9295995831489563, "learning_rate": 2.5782276900372655e-05, "loss": 0.2068, "num_input_tokens_seen": 3969496, "step": 20820 }, { "epoch": 10.823804573804575, "grad_norm": 0.6646581888198853, "learning_rate": 2.5770943175613327e-05, "loss": 0.1861, "num_input_tokens_seen": 3970424, "step": 20825 }, { "epoch": 10.826403326403327, "grad_norm": 0.4740408957004547, "learning_rate": 2.5759609292252606e-05, "loss": 0.1784, "num_input_tokens_seen": 3971384, "step": 20830 }, { "epoch": 10.829002079002079, "grad_norm": 0.6113303899765015, "learning_rate": 2.5748275252622163e-05, "loss": 0.0759, "num_input_tokens_seen": 3972312, "step": 20835 }, { "epoch": 10.83160083160083, "grad_norm": 0.8580384254455566, "learning_rate": 2.573694105905366e-05, "loss": 0.1742, "num_input_tokens_seen": 3973272, "step": 20840 }, { "epoch": 10.834199584199585, "grad_norm": 0.4770044982433319, "learning_rate": 2.5725606713878825e-05, "loss": 0.1085, "num_input_tokens_seen": 3974232, "step": 20845 }, { "epoch": 10.836798336798337, "grad_norm": 1.0638999938964844, "learning_rate": 2.5714272219429396e-05, "loss": 0.225, "num_input_tokens_seen": 3975224, "step": 20850 }, { "epoch": 10.839397089397089, "grad_norm": 1.0880471467971802, "learning_rate": 2.5702937578037156e-05, "loss": 0.0887, "num_input_tokens_seen": 3976248, "step": 20855 }, { "epoch": 10.841995841995843, "grad_norm": 0.8217376470565796, "learning_rate": 2.5691602792033897e-05, "loss": 0.1501, "num_input_tokens_seen": 3977304, "step": 20860 }, { "epoch": 10.844594594594595, "grad_norm": 0.3535594046115875, "learning_rate": 2.568026786375147e-05, "loss": 0.0848, "num_input_tokens_seen": 3978264, "step": 20865 }, { "epoch": 10.847193347193347, "grad_norm": 0.2621106505393982, "learning_rate": 2.5668932795521728e-05, "loss": 0.0767, "num_input_tokens_seen": 3979224, "step": 20870 }, { "epoch": 10.8497920997921, "grad_norm": 0.40835726261138916, "learning_rate": 2.5657597589676567e-05, "loss": 0.0343, "num_input_tokens_seen": 3980216, "step": 20875 }, { "epoch": 10.852390852390853, "grad_norm": 0.2560855746269226, "learning_rate": 2.5646262248547915e-05, "loss": 0.0943, "num_input_tokens_seen": 3981112, "step": 20880 }, { "epoch": 10.854989604989605, "grad_norm": 1.0382887125015259, "learning_rate": 2.5634926774467715e-05, "loss": 0.1122, "num_input_tokens_seen": 3982072, "step": 20885 }, { "epoch": 10.857588357588357, "grad_norm": 0.5483883023262024, "learning_rate": 2.562359116976796e-05, "loss": 0.0886, "num_input_tokens_seen": 3983064, "step": 20890 }, { "epoch": 10.86018711018711, "grad_norm": 1.6001311540603638, "learning_rate": 2.561225543678063e-05, "loss": 0.1672, "num_input_tokens_seen": 3984024, "step": 20895 }, { "epoch": 10.862785862785863, "grad_norm": 0.4594760835170746, "learning_rate": 2.5600919577837768e-05, "loss": 0.1752, "num_input_tokens_seen": 3984952, "step": 20900 }, { "epoch": 10.865384615384615, "grad_norm": 0.9960254430770874, "learning_rate": 2.5589583595271428e-05, "loss": 0.2876, "num_input_tokens_seen": 3985944, "step": 20905 }, { "epoch": 10.867983367983367, "grad_norm": 0.7710526585578918, "learning_rate": 2.5578247491413693e-05, "loss": 0.0916, "num_input_tokens_seen": 3986904, "step": 20910 }, { "epoch": 10.870582120582121, "grad_norm": 0.7855666279792786, "learning_rate": 2.5566911268596666e-05, "loss": 0.0613, "num_input_tokens_seen": 3987864, "step": 20915 }, { "epoch": 10.873180873180873, "grad_norm": 1.1302266120910645, "learning_rate": 2.555557492915249e-05, "loss": 0.2001, "num_input_tokens_seen": 3988792, "step": 20920 }, { "epoch": 10.875779625779625, "grad_norm": 1.1491812467575073, "learning_rate": 2.5544238475413307e-05, "loss": 0.121, "num_input_tokens_seen": 3989688, "step": 20925 }, { "epoch": 10.878378378378379, "grad_norm": 0.0876876637339592, "learning_rate": 2.55329019097113e-05, "loss": 0.1198, "num_input_tokens_seen": 3990648, "step": 20930 }, { "epoch": 10.880977130977131, "grad_norm": 0.42579910159111023, "learning_rate": 2.552156523437867e-05, "loss": 0.0812, "num_input_tokens_seen": 3991576, "step": 20935 }, { "epoch": 10.883575883575883, "grad_norm": 0.2053472101688385, "learning_rate": 2.5510228451747643e-05, "loss": 0.1204, "num_input_tokens_seen": 3992536, "step": 20940 }, { "epoch": 10.886174636174637, "grad_norm": 0.578948438167572, "learning_rate": 2.5498891564150468e-05, "loss": 0.0653, "num_input_tokens_seen": 3993432, "step": 20945 }, { "epoch": 10.888773388773389, "grad_norm": 2.058539628982544, "learning_rate": 2.5487554573919402e-05, "loss": 0.1955, "num_input_tokens_seen": 3994296, "step": 20950 }, { "epoch": 10.891372141372141, "grad_norm": 1.4677397012710571, "learning_rate": 2.547621748338675e-05, "loss": 0.2077, "num_input_tokens_seen": 3995256, "step": 20955 }, { "epoch": 10.893970893970893, "grad_norm": 0.1495295614004135, "learning_rate": 2.5464880294884813e-05, "loss": 0.2015, "num_input_tokens_seen": 3996152, "step": 20960 }, { "epoch": 10.896569646569647, "grad_norm": 0.16910681128501892, "learning_rate": 2.5453543010745918e-05, "loss": 0.1239, "num_input_tokens_seen": 3997112, "step": 20965 }, { "epoch": 10.8991683991684, "grad_norm": 0.9370072484016418, "learning_rate": 2.5442205633302417e-05, "loss": 0.1923, "num_input_tokens_seen": 3998072, "step": 20970 }, { "epoch": 10.901767151767151, "grad_norm": 0.7199063897132874, "learning_rate": 2.543086816488668e-05, "loss": 0.1507, "num_input_tokens_seen": 3998936, "step": 20975 }, { "epoch": 10.904365904365905, "grad_norm": 1.1245403289794922, "learning_rate": 2.54195306078311e-05, "loss": 0.3148, "num_input_tokens_seen": 3999896, "step": 20980 }, { "epoch": 10.906964656964657, "grad_norm": 0.7904741764068604, "learning_rate": 2.5408192964468068e-05, "loss": 0.1085, "num_input_tokens_seen": 4000856, "step": 20985 }, { "epoch": 10.90956340956341, "grad_norm": 0.3435867428779602, "learning_rate": 2.539685523713002e-05, "loss": 0.1451, "num_input_tokens_seen": 4001784, "step": 20990 }, { "epoch": 10.912162162162161, "grad_norm": 0.23022840917110443, "learning_rate": 2.5385517428149398e-05, "loss": 0.1186, "num_input_tokens_seen": 4002776, "step": 20995 }, { "epoch": 10.914760914760915, "grad_norm": 1.2691327333450317, "learning_rate": 2.5374179539858645e-05, "loss": 0.1959, "num_input_tokens_seen": 4003736, "step": 21000 }, { "epoch": 10.917359667359667, "grad_norm": 0.31390517950057983, "learning_rate": 2.5362841574590247e-05, "loss": 0.0425, "num_input_tokens_seen": 4004728, "step": 21005 }, { "epoch": 10.91995841995842, "grad_norm": 0.5055538415908813, "learning_rate": 2.535150353467669e-05, "loss": 0.067, "num_input_tokens_seen": 4005688, "step": 21010 }, { "epoch": 10.922557172557173, "grad_norm": 0.3024044334888458, "learning_rate": 2.534016542245048e-05, "loss": 0.1991, "num_input_tokens_seen": 4006808, "step": 21015 }, { "epoch": 10.925155925155925, "grad_norm": 0.3944263160228729, "learning_rate": 2.5328827240244142e-05, "loss": 0.1933, "num_input_tokens_seen": 4007768, "step": 21020 }, { "epoch": 10.927754677754677, "grad_norm": 0.10846442729234695, "learning_rate": 2.5317488990390204e-05, "loss": 0.0622, "num_input_tokens_seen": 4008728, "step": 21025 }, { "epoch": 10.93035343035343, "grad_norm": 0.2714587152004242, "learning_rate": 2.5306150675221203e-05, "loss": 0.0614, "num_input_tokens_seen": 4009656, "step": 21030 }, { "epoch": 10.932952182952183, "grad_norm": 1.2573699951171875, "learning_rate": 2.5294812297069725e-05, "loss": 0.1592, "num_input_tokens_seen": 4010552, "step": 21035 }, { "epoch": 10.935550935550935, "grad_norm": 0.4530680477619171, "learning_rate": 2.5283473858268327e-05, "loss": 0.0483, "num_input_tokens_seen": 4011576, "step": 21040 }, { "epoch": 10.938149688149688, "grad_norm": 0.7775524854660034, "learning_rate": 2.5272135361149598e-05, "loss": 0.1133, "num_input_tokens_seen": 4012536, "step": 21045 }, { "epoch": 10.940748440748441, "grad_norm": 0.394927978515625, "learning_rate": 2.5260796808046145e-05, "loss": 0.2733, "num_input_tokens_seen": 4013432, "step": 21050 }, { "epoch": 10.943347193347194, "grad_norm": 0.6337105631828308, "learning_rate": 2.524945820129057e-05, "loss": 0.1909, "num_input_tokens_seen": 4014456, "step": 21055 }, { "epoch": 10.945945945945946, "grad_norm": 0.65855473279953, "learning_rate": 2.5238119543215504e-05, "loss": 0.0709, "num_input_tokens_seen": 4015288, "step": 21060 }, { "epoch": 10.948544698544698, "grad_norm": 0.19500483572483063, "learning_rate": 2.5226780836153567e-05, "loss": 0.073, "num_input_tokens_seen": 4016248, "step": 21065 }, { "epoch": 10.951143451143452, "grad_norm": 0.5642194151878357, "learning_rate": 2.5215442082437418e-05, "loss": 0.1161, "num_input_tokens_seen": 4017176, "step": 21070 }, { "epoch": 10.953742203742204, "grad_norm": 0.6629195213317871, "learning_rate": 2.5204103284399687e-05, "loss": 0.1, "num_input_tokens_seen": 4018168, "step": 21075 }, { "epoch": 10.956340956340956, "grad_norm": 0.6745949983596802, "learning_rate": 2.5192764444373056e-05, "loss": 0.0949, "num_input_tokens_seen": 4019128, "step": 21080 }, { "epoch": 10.95893970893971, "grad_norm": 1.374239206314087, "learning_rate": 2.5181425564690182e-05, "loss": 0.1984, "num_input_tokens_seen": 4020024, "step": 21085 }, { "epoch": 10.961538461538462, "grad_norm": 0.1432693600654602, "learning_rate": 2.5170086647683743e-05, "loss": 0.1633, "num_input_tokens_seen": 4021016, "step": 21090 }, { "epoch": 10.964137214137214, "grad_norm": 0.2536642551422119, "learning_rate": 2.515874769568643e-05, "loss": 0.167, "num_input_tokens_seen": 4021976, "step": 21095 }, { "epoch": 10.966735966735968, "grad_norm": 0.6889697909355164, "learning_rate": 2.514740871103093e-05, "loss": 0.1029, "num_input_tokens_seen": 4022904, "step": 21100 }, { "epoch": 10.96933471933472, "grad_norm": 0.48188191652297974, "learning_rate": 2.513606969604995e-05, "loss": 0.0645, "num_input_tokens_seen": 4023832, "step": 21105 }, { "epoch": 10.971933471933472, "grad_norm": 0.40840354561805725, "learning_rate": 2.5124730653076184e-05, "loss": 0.1011, "num_input_tokens_seen": 4024792, "step": 21110 }, { "epoch": 10.974532224532224, "grad_norm": 0.3599551022052765, "learning_rate": 2.5113391584442357e-05, "loss": 0.1642, "num_input_tokens_seen": 4025688, "step": 21115 }, { "epoch": 10.977130977130978, "grad_norm": 0.5764984488487244, "learning_rate": 2.510205249248118e-05, "loss": 0.103, "num_input_tokens_seen": 4026648, "step": 21120 }, { "epoch": 10.97972972972973, "grad_norm": 0.9007591009140015, "learning_rate": 2.5090713379525366e-05, "loss": 0.1691, "num_input_tokens_seen": 4027576, "step": 21125 }, { "epoch": 10.982328482328482, "grad_norm": 0.16062919795513153, "learning_rate": 2.5079374247907656e-05, "loss": 0.1108, "num_input_tokens_seen": 4028632, "step": 21130 }, { "epoch": 10.984927234927234, "grad_norm": 0.5434979796409607, "learning_rate": 2.5068035099960767e-05, "loss": 0.1601, "num_input_tokens_seen": 4029592, "step": 21135 }, { "epoch": 10.987525987525988, "grad_norm": 0.14069321751594543, "learning_rate": 2.505669593801744e-05, "loss": 0.0724, "num_input_tokens_seen": 4030520, "step": 21140 }, { "epoch": 10.99012474012474, "grad_norm": 0.19228145480155945, "learning_rate": 2.5045356764410398e-05, "loss": 0.078, "num_input_tokens_seen": 4031480, "step": 21145 }, { "epoch": 10.992723492723492, "grad_norm": 0.6397374868392944, "learning_rate": 2.503401758147241e-05, "loss": 0.0898, "num_input_tokens_seen": 4032344, "step": 21150 }, { "epoch": 10.995322245322246, "grad_norm": 0.9263661503791809, "learning_rate": 2.5022678391536175e-05, "loss": 0.211, "num_input_tokens_seen": 4033240, "step": 21155 }, { "epoch": 10.997920997920998, "grad_norm": 0.7718605399131775, "learning_rate": 2.5011339196934465e-05, "loss": 0.1826, "num_input_tokens_seen": 4034200, "step": 21160 }, { "epoch": 11.0, "eval_loss": 0.14659900963306427, "eval_runtime": 8.016, "eval_samples_per_second": 106.787, "eval_steps_per_second": 26.697, "num_input_tokens_seen": 4034856, "step": 21164 }, { "epoch": 11.00051975051975, "grad_norm": 0.636667788028717, "learning_rate": 2.5e-05, "loss": 0.0953, "num_input_tokens_seen": 4035080, "step": 21165 }, { "epoch": 11.003118503118504, "grad_norm": 0.962470293045044, "learning_rate": 2.498866080306554e-05, "loss": 0.1677, "num_input_tokens_seen": 4036072, "step": 21170 }, { "epoch": 11.005717255717256, "grad_norm": 0.20088495314121246, "learning_rate": 2.497732160846383e-05, "loss": 0.0514, "num_input_tokens_seen": 4037000, "step": 21175 }, { "epoch": 11.008316008316008, "grad_norm": 1.1347168684005737, "learning_rate": 2.4965982418527597e-05, "loss": 0.3049, "num_input_tokens_seen": 4037896, "step": 21180 }, { "epoch": 11.01091476091476, "grad_norm": 0.7548483610153198, "learning_rate": 2.4954643235589598e-05, "loss": 0.0724, "num_input_tokens_seen": 4038824, "step": 21185 }, { "epoch": 11.013513513513514, "grad_norm": 0.46553850173950195, "learning_rate": 2.494330406198256e-05, "loss": 0.1814, "num_input_tokens_seen": 4039752, "step": 21190 }, { "epoch": 11.016112266112266, "grad_norm": 0.743008017539978, "learning_rate": 2.4931964900039235e-05, "loss": 0.0824, "num_input_tokens_seen": 4040712, "step": 21195 }, { "epoch": 11.018711018711018, "grad_norm": 1.0314478874206543, "learning_rate": 2.4920625752092356e-05, "loss": 0.1937, "num_input_tokens_seen": 4041640, "step": 21200 }, { "epoch": 11.021309771309772, "grad_norm": 1.353775143623352, "learning_rate": 2.4909286620474636e-05, "loss": 0.1361, "num_input_tokens_seen": 4042600, "step": 21205 }, { "epoch": 11.023908523908524, "grad_norm": 0.29573342204093933, "learning_rate": 2.4897947507518828e-05, "loss": 0.0726, "num_input_tokens_seen": 4043560, "step": 21210 }, { "epoch": 11.026507276507276, "grad_norm": 0.26673969626426697, "learning_rate": 2.4886608415557645e-05, "loss": 0.1741, "num_input_tokens_seen": 4044584, "step": 21215 }, { "epoch": 11.029106029106028, "grad_norm": 0.4710622727870941, "learning_rate": 2.487526934692382e-05, "loss": 0.0932, "num_input_tokens_seen": 4045448, "step": 21220 }, { "epoch": 11.031704781704782, "grad_norm": 0.10016869753599167, "learning_rate": 2.4863930303950055e-05, "loss": 0.1141, "num_input_tokens_seen": 4046408, "step": 21225 }, { "epoch": 11.034303534303534, "grad_norm": 0.1767250895500183, "learning_rate": 2.4852591288969076e-05, "loss": 0.1283, "num_input_tokens_seen": 4047336, "step": 21230 }, { "epoch": 11.036902286902286, "grad_norm": 0.7284164428710938, "learning_rate": 2.4841252304313575e-05, "loss": 0.1216, "num_input_tokens_seen": 4048232, "step": 21235 }, { "epoch": 11.03950103950104, "grad_norm": 0.1688842624425888, "learning_rate": 2.4829913352316263e-05, "loss": 0.1578, "num_input_tokens_seen": 4049192, "step": 21240 }, { "epoch": 11.042099792099792, "grad_norm": 1.7632088661193848, "learning_rate": 2.4818574435309824e-05, "loss": 0.3492, "num_input_tokens_seen": 4050152, "step": 21245 }, { "epoch": 11.044698544698544, "grad_norm": 0.7703474760055542, "learning_rate": 2.480723555562695e-05, "loss": 0.082, "num_input_tokens_seen": 4051176, "step": 21250 }, { "epoch": 11.047297297297296, "grad_norm": 0.10249638557434082, "learning_rate": 2.4795896715600312e-05, "loss": 0.1278, "num_input_tokens_seen": 4052168, "step": 21255 }, { "epoch": 11.04989604989605, "grad_norm": 0.3577013313770294, "learning_rate": 2.478455791756259e-05, "loss": 0.1471, "num_input_tokens_seen": 4053096, "step": 21260 }, { "epoch": 11.052494802494802, "grad_norm": 1.009680986404419, "learning_rate": 2.4773219163846442e-05, "loss": 0.206, "num_input_tokens_seen": 4053992, "step": 21265 }, { "epoch": 11.055093555093555, "grad_norm": 0.4748879671096802, "learning_rate": 2.4761880456784505e-05, "loss": 0.1376, "num_input_tokens_seen": 4054888, "step": 21270 }, { "epoch": 11.057692307692308, "grad_norm": 1.0690882205963135, "learning_rate": 2.4750541798709438e-05, "loss": 0.1817, "num_input_tokens_seen": 4055784, "step": 21275 }, { "epoch": 11.06029106029106, "grad_norm": 0.6356944441795349, "learning_rate": 2.4739203191953865e-05, "loss": 0.0943, "num_input_tokens_seen": 4056648, "step": 21280 }, { "epoch": 11.062889812889813, "grad_norm": 0.4587838053703308, "learning_rate": 2.472786463885041e-05, "loss": 0.1399, "num_input_tokens_seen": 4057640, "step": 21285 }, { "epoch": 11.065488565488565, "grad_norm": 0.8540288209915161, "learning_rate": 2.4716526141731683e-05, "loss": 0.046, "num_input_tokens_seen": 4058568, "step": 21290 }, { "epoch": 11.068087318087318, "grad_norm": 1.0317065715789795, "learning_rate": 2.470518770293028e-05, "loss": 0.1776, "num_input_tokens_seen": 4059560, "step": 21295 }, { "epoch": 11.07068607068607, "grad_norm": 0.5326920747756958, "learning_rate": 2.46938493247788e-05, "loss": 0.1326, "num_input_tokens_seen": 4060520, "step": 21300 }, { "epoch": 11.073284823284823, "grad_norm": 0.083225317299366, "learning_rate": 2.4682511009609805e-05, "loss": 0.077, "num_input_tokens_seen": 4061480, "step": 21305 }, { "epoch": 11.075883575883577, "grad_norm": 0.7784639000892639, "learning_rate": 2.4671172759755864e-05, "loss": 0.1869, "num_input_tokens_seen": 4062376, "step": 21310 }, { "epoch": 11.078482328482329, "grad_norm": 0.2568117380142212, "learning_rate": 2.4659834577549517e-05, "loss": 0.1815, "num_input_tokens_seen": 4063432, "step": 21315 }, { "epoch": 11.08108108108108, "grad_norm": 0.40809670090675354, "learning_rate": 2.464849646532331e-05, "loss": 0.1076, "num_input_tokens_seen": 4064360, "step": 21320 }, { "epoch": 11.083679833679835, "grad_norm": 0.8830186128616333, "learning_rate": 2.4637158425409755e-05, "loss": 0.135, "num_input_tokens_seen": 4065320, "step": 21325 }, { "epoch": 11.086278586278587, "grad_norm": 0.2231680005788803, "learning_rate": 2.4625820460141364e-05, "loss": 0.0551, "num_input_tokens_seen": 4066248, "step": 21330 }, { "epoch": 11.088877338877339, "grad_norm": 0.5207663774490356, "learning_rate": 2.4614482571850615e-05, "loss": 0.177, "num_input_tokens_seen": 4067208, "step": 21335 }, { "epoch": 11.09147609147609, "grad_norm": 0.47147050499916077, "learning_rate": 2.4603144762869984e-05, "loss": 0.1059, "num_input_tokens_seen": 4068200, "step": 21340 }, { "epoch": 11.094074844074845, "grad_norm": 0.227671816945076, "learning_rate": 2.4591807035531938e-05, "loss": 0.1203, "num_input_tokens_seen": 4069128, "step": 21345 }, { "epoch": 11.096673596673597, "grad_norm": 0.710444450378418, "learning_rate": 2.4580469392168907e-05, "loss": 0.1862, "num_input_tokens_seen": 4070120, "step": 21350 }, { "epoch": 11.099272349272349, "grad_norm": 1.1796220541000366, "learning_rate": 2.4569131835113323e-05, "loss": 0.2669, "num_input_tokens_seen": 4071208, "step": 21355 }, { "epoch": 11.101871101871103, "grad_norm": 0.8255848288536072, "learning_rate": 2.4557794366697586e-05, "loss": 0.2015, "num_input_tokens_seen": 4072168, "step": 21360 }, { "epoch": 11.104469854469855, "grad_norm": 0.5612479448318481, "learning_rate": 2.4546456989254088e-05, "loss": 0.0726, "num_input_tokens_seen": 4073096, "step": 21365 }, { "epoch": 11.107068607068607, "grad_norm": 0.9967260360717773, "learning_rate": 2.4535119705115193e-05, "loss": 0.114, "num_input_tokens_seen": 4074024, "step": 21370 }, { "epoch": 11.109667359667359, "grad_norm": 1.016950011253357, "learning_rate": 2.4523782516613253e-05, "loss": 0.1001, "num_input_tokens_seen": 4075048, "step": 21375 }, { "epoch": 11.112266112266113, "grad_norm": 1.0470184087753296, "learning_rate": 2.4512445426080597e-05, "loss": 0.105, "num_input_tokens_seen": 4075944, "step": 21380 }, { "epoch": 11.114864864864865, "grad_norm": 1.22797691822052, "learning_rate": 2.4501108435849538e-05, "loss": 0.0932, "num_input_tokens_seen": 4076936, "step": 21385 }, { "epoch": 11.117463617463617, "grad_norm": 1.0550811290740967, "learning_rate": 2.448977154825236e-05, "loss": 0.2459, "num_input_tokens_seen": 4077896, "step": 21390 }, { "epoch": 11.12006237006237, "grad_norm": 0.63236403465271, "learning_rate": 2.447843476562134e-05, "loss": 0.173, "num_input_tokens_seen": 4078824, "step": 21395 }, { "epoch": 11.122661122661123, "grad_norm": 0.8132111430168152, "learning_rate": 2.4467098090288714e-05, "loss": 0.1291, "num_input_tokens_seen": 4079816, "step": 21400 }, { "epoch": 11.125259875259875, "grad_norm": 0.9807899594306946, "learning_rate": 2.4455761524586705e-05, "loss": 0.1522, "num_input_tokens_seen": 4080808, "step": 21405 }, { "epoch": 11.127858627858627, "grad_norm": 0.12034203857183456, "learning_rate": 2.4444425070847524e-05, "loss": 0.1111, "num_input_tokens_seen": 4081800, "step": 21410 }, { "epoch": 11.130457380457381, "grad_norm": 0.9926587343215942, "learning_rate": 2.443308873140334e-05, "loss": 0.1421, "num_input_tokens_seen": 4082824, "step": 21415 }, { "epoch": 11.133056133056133, "grad_norm": 0.38739562034606934, "learning_rate": 2.4421752508586313e-05, "loss": 0.0761, "num_input_tokens_seen": 4083688, "step": 21420 }, { "epoch": 11.135654885654885, "grad_norm": 0.8680197596549988, "learning_rate": 2.441041640472858e-05, "loss": 0.1684, "num_input_tokens_seen": 4084680, "step": 21425 }, { "epoch": 11.138253638253639, "grad_norm": 0.272628515958786, "learning_rate": 2.4399080422162238e-05, "loss": 0.1227, "num_input_tokens_seen": 4085608, "step": 21430 }, { "epoch": 11.140852390852391, "grad_norm": 0.2968934178352356, "learning_rate": 2.4387744563219377e-05, "loss": 0.0416, "num_input_tokens_seen": 4086504, "step": 21435 }, { "epoch": 11.143451143451143, "grad_norm": 0.1253998875617981, "learning_rate": 2.4376408830232043e-05, "loss": 0.064, "num_input_tokens_seen": 4087464, "step": 21440 }, { "epoch": 11.146049896049895, "grad_norm": 0.5365723967552185, "learning_rate": 2.4365073225532284e-05, "loss": 0.108, "num_input_tokens_seen": 4088392, "step": 21445 }, { "epoch": 11.14864864864865, "grad_norm": 1.787402868270874, "learning_rate": 2.4353737751452084e-05, "loss": 0.1103, "num_input_tokens_seen": 4089288, "step": 21450 }, { "epoch": 11.151247401247401, "grad_norm": 0.5744081139564514, "learning_rate": 2.4342402410323435e-05, "loss": 0.073, "num_input_tokens_seen": 4090248, "step": 21455 }, { "epoch": 11.153846153846153, "grad_norm": 0.4493429660797119, "learning_rate": 2.4331067204478285e-05, "loss": 0.1181, "num_input_tokens_seen": 4091208, "step": 21460 }, { "epoch": 11.156444906444907, "grad_norm": 1.4416866302490234, "learning_rate": 2.4319732136248537e-05, "loss": 0.2964, "num_input_tokens_seen": 4092168, "step": 21465 }, { "epoch": 11.15904365904366, "grad_norm": 0.1701095849275589, "learning_rate": 2.4308397207966112e-05, "loss": 0.1526, "num_input_tokens_seen": 4093160, "step": 21470 }, { "epoch": 11.161642411642411, "grad_norm": 0.6905718445777893, "learning_rate": 2.429706242196285e-05, "loss": 0.1599, "num_input_tokens_seen": 4094088, "step": 21475 }, { "epoch": 11.164241164241163, "grad_norm": 0.5133528709411621, "learning_rate": 2.428572778057061e-05, "loss": 0.0659, "num_input_tokens_seen": 4095016, "step": 21480 }, { "epoch": 11.166839916839917, "grad_norm": 0.24431438744068146, "learning_rate": 2.4274393286121177e-05, "loss": 0.0336, "num_input_tokens_seen": 4095976, "step": 21485 }, { "epoch": 11.16943866943867, "grad_norm": 0.18676795065402985, "learning_rate": 2.4263058940946346e-05, "loss": 0.0626, "num_input_tokens_seen": 4096968, "step": 21490 }, { "epoch": 11.172037422037421, "grad_norm": 0.46365469694137573, "learning_rate": 2.4251724747377843e-05, "loss": 0.1182, "num_input_tokens_seen": 4097928, "step": 21495 }, { "epoch": 11.174636174636175, "grad_norm": 0.9684801697731018, "learning_rate": 2.42403907077474e-05, "loss": 0.1158, "num_input_tokens_seen": 4098920, "step": 21500 }, { "epoch": 11.177234927234927, "grad_norm": 0.5710857510566711, "learning_rate": 2.4229056824386682e-05, "loss": 0.163, "num_input_tokens_seen": 4099816, "step": 21505 }, { "epoch": 11.17983367983368, "grad_norm": 0.8904186487197876, "learning_rate": 2.4217723099627348e-05, "loss": 0.1453, "num_input_tokens_seen": 4100744, "step": 21510 }, { "epoch": 11.182432432432432, "grad_norm": 0.6997168660163879, "learning_rate": 2.4206389535801014e-05, "loss": 0.1234, "num_input_tokens_seen": 4101672, "step": 21515 }, { "epoch": 11.185031185031185, "grad_norm": 0.37263867259025574, "learning_rate": 2.419505613523927e-05, "loss": 0.1454, "num_input_tokens_seen": 4102632, "step": 21520 }, { "epoch": 11.187629937629938, "grad_norm": 0.48540371656417847, "learning_rate": 2.4183722900273664e-05, "loss": 0.1475, "num_input_tokens_seen": 4103528, "step": 21525 }, { "epoch": 11.19022869022869, "grad_norm": 1.0246264934539795, "learning_rate": 2.4172389833235698e-05, "loss": 0.172, "num_input_tokens_seen": 4104520, "step": 21530 }, { "epoch": 11.192827442827443, "grad_norm": 2.011350154876709, "learning_rate": 2.4161056936456873e-05, "loss": 0.251, "num_input_tokens_seen": 4105480, "step": 21535 }, { "epoch": 11.195426195426196, "grad_norm": 0.5380877256393433, "learning_rate": 2.414972421226862e-05, "loss": 0.0727, "num_input_tokens_seen": 4106440, "step": 21540 }, { "epoch": 11.198024948024948, "grad_norm": 0.14233605563640594, "learning_rate": 2.413839166300236e-05, "loss": 0.1, "num_input_tokens_seen": 4107368, "step": 21545 }, { "epoch": 11.200623700623701, "grad_norm": 0.9493620991706848, "learning_rate": 2.4127059290989474e-05, "loss": 0.2099, "num_input_tokens_seen": 4108328, "step": 21550 }, { "epoch": 11.203222453222454, "grad_norm": 1.4759503602981567, "learning_rate": 2.4115727098561286e-05, "loss": 0.1913, "num_input_tokens_seen": 4109352, "step": 21555 }, { "epoch": 11.205821205821206, "grad_norm": 0.25044968724250793, "learning_rate": 2.410439508804911e-05, "loss": 0.1511, "num_input_tokens_seen": 4110248, "step": 21560 }, { "epoch": 11.208419958419958, "grad_norm": 0.6552756428718567, "learning_rate": 2.4093063261784206e-05, "loss": 0.1261, "num_input_tokens_seen": 4111208, "step": 21565 }, { "epoch": 11.211018711018712, "grad_norm": 0.7332101464271545, "learning_rate": 2.4081731622097807e-05, "loss": 0.1504, "num_input_tokens_seen": 4112296, "step": 21570 }, { "epoch": 11.213617463617464, "grad_norm": 1.830505609512329, "learning_rate": 2.4070400171321086e-05, "loss": 0.1667, "num_input_tokens_seen": 4113224, "step": 21575 }, { "epoch": 11.216216216216216, "grad_norm": 1.287129282951355, "learning_rate": 2.4059068911785208e-05, "loss": 0.2251, "num_input_tokens_seen": 4114184, "step": 21580 }, { "epoch": 11.21881496881497, "grad_norm": 1.3979140520095825, "learning_rate": 2.404773784582127e-05, "loss": 0.1454, "num_input_tokens_seen": 4115112, "step": 21585 }, { "epoch": 11.221413721413722, "grad_norm": 1.6155548095703125, "learning_rate": 2.403640697576036e-05, "loss": 0.1045, "num_input_tokens_seen": 4116168, "step": 21590 }, { "epoch": 11.224012474012474, "grad_norm": 0.08623145520687103, "learning_rate": 2.4025076303933503e-05, "loss": 0.1382, "num_input_tokens_seen": 4117096, "step": 21595 }, { "epoch": 11.226611226611226, "grad_norm": 0.2944251298904419, "learning_rate": 2.401374583267167e-05, "loss": 0.122, "num_input_tokens_seen": 4118088, "step": 21600 }, { "epoch": 11.22920997920998, "grad_norm": 1.0883002281188965, "learning_rate": 2.4002415564305825e-05, "loss": 0.1701, "num_input_tokens_seen": 4119016, "step": 21605 }, { "epoch": 11.231808731808732, "grad_norm": 0.42180687189102173, "learning_rate": 2.3991085501166868e-05, "loss": 0.1236, "num_input_tokens_seen": 4120008, "step": 21610 }, { "epoch": 11.234407484407484, "grad_norm": 0.200786292552948, "learning_rate": 2.397975564558567e-05, "loss": 0.0739, "num_input_tokens_seen": 4120872, "step": 21615 }, { "epoch": 11.237006237006238, "grad_norm": 0.4316648244857788, "learning_rate": 2.3968425999893044e-05, "loss": 0.0586, "num_input_tokens_seen": 4121864, "step": 21620 }, { "epoch": 11.23960498960499, "grad_norm": 0.267889142036438, "learning_rate": 2.3957096566419775e-05, "loss": 0.0636, "num_input_tokens_seen": 4122792, "step": 21625 }, { "epoch": 11.242203742203742, "grad_norm": 0.948565661907196, "learning_rate": 2.3945767347496588e-05, "loss": 0.1124, "num_input_tokens_seen": 4123752, "step": 21630 }, { "epoch": 11.244802494802494, "grad_norm": 0.47425180673599243, "learning_rate": 2.393443834545419e-05, "loss": 0.1455, "num_input_tokens_seen": 4124680, "step": 21635 }, { "epoch": 11.247401247401248, "grad_norm": 0.9705814719200134, "learning_rate": 2.392310956262321e-05, "loss": 0.1493, "num_input_tokens_seen": 4125608, "step": 21640 }, { "epoch": 11.25, "grad_norm": 0.7047691345214844, "learning_rate": 2.391178100133426e-05, "loss": 0.162, "num_input_tokens_seen": 4126664, "step": 21645 }, { "epoch": 11.252598752598752, "grad_norm": 0.6048672199249268, "learning_rate": 2.3900452663917886e-05, "loss": 0.0919, "num_input_tokens_seen": 4127592, "step": 21650 }, { "epoch": 11.255197505197506, "grad_norm": 0.1324310451745987, "learning_rate": 2.38891245527046e-05, "loss": 0.0856, "num_input_tokens_seen": 4128552, "step": 21655 }, { "epoch": 11.257796257796258, "grad_norm": 0.47637003660202026, "learning_rate": 2.3877796670024884e-05, "loss": 0.1153, "num_input_tokens_seen": 4129544, "step": 21660 }, { "epoch": 11.26039501039501, "grad_norm": 0.8148508071899414, "learning_rate": 2.3866469018209116e-05, "loss": 0.0665, "num_input_tokens_seen": 4130408, "step": 21665 }, { "epoch": 11.262993762993762, "grad_norm": 0.48184940218925476, "learning_rate": 2.385514159958768e-05, "loss": 0.1343, "num_input_tokens_seen": 4131336, "step": 21670 }, { "epoch": 11.265592515592516, "grad_norm": 0.3601316809654236, "learning_rate": 2.384381441649091e-05, "loss": 0.1639, "num_input_tokens_seen": 4132264, "step": 21675 }, { "epoch": 11.268191268191268, "grad_norm": 0.7489213943481445, "learning_rate": 2.383248747124906e-05, "loss": 0.1175, "num_input_tokens_seen": 4133192, "step": 21680 }, { "epoch": 11.27079002079002, "grad_norm": 0.7446514964103699, "learning_rate": 2.3821160766192362e-05, "loss": 0.1015, "num_input_tokens_seen": 4134152, "step": 21685 }, { "epoch": 11.273388773388774, "grad_norm": 0.26672402024269104, "learning_rate": 2.380983430365098e-05, "loss": 0.1148, "num_input_tokens_seen": 4135112, "step": 21690 }, { "epoch": 11.275987525987526, "grad_norm": 0.26389235258102417, "learning_rate": 2.3798508085955055e-05, "loss": 0.1887, "num_input_tokens_seen": 4136072, "step": 21695 }, { "epoch": 11.278586278586278, "grad_norm": 0.8925632834434509, "learning_rate": 2.3787182115434635e-05, "loss": 0.1301, "num_input_tokens_seen": 4137096, "step": 21700 }, { "epoch": 11.28118503118503, "grad_norm": 0.1724606454372406, "learning_rate": 2.3775856394419764e-05, "loss": 0.0845, "num_input_tokens_seen": 4138024, "step": 21705 }, { "epoch": 11.283783783783784, "grad_norm": 0.13864119350910187, "learning_rate": 2.37645309252404e-05, "loss": 0.0668, "num_input_tokens_seen": 4139048, "step": 21710 }, { "epoch": 11.286382536382536, "grad_norm": 0.5765827894210815, "learning_rate": 2.375320571022647e-05, "loss": 0.0496, "num_input_tokens_seen": 4139944, "step": 21715 }, { "epoch": 11.288981288981288, "grad_norm": 0.6751233339309692, "learning_rate": 2.374188075170783e-05, "loss": 0.22, "num_input_tokens_seen": 4140872, "step": 21720 }, { "epoch": 11.291580041580042, "grad_norm": 0.147729754447937, "learning_rate": 2.3730556052014314e-05, "loss": 0.1048, "num_input_tokens_seen": 4141864, "step": 21725 }, { "epoch": 11.294178794178794, "grad_norm": 1.5772099494934082, "learning_rate": 2.3719231613475663e-05, "loss": 0.1759, "num_input_tokens_seen": 4142792, "step": 21730 }, { "epoch": 11.296777546777546, "grad_norm": 0.4299401640892029, "learning_rate": 2.3707907438421583e-05, "loss": 0.187, "num_input_tokens_seen": 4143816, "step": 21735 }, { "epoch": 11.299376299376299, "grad_norm": 0.7281152009963989, "learning_rate": 2.369658352918174e-05, "loss": 0.1275, "num_input_tokens_seen": 4144712, "step": 21740 }, { "epoch": 11.301975051975052, "grad_norm": 0.16026119887828827, "learning_rate": 2.3685259888085717e-05, "loss": 0.0808, "num_input_tokens_seen": 4145704, "step": 21745 }, { "epoch": 11.304573804573804, "grad_norm": 1.3362274169921875, "learning_rate": 2.3673936517463075e-05, "loss": 0.1028, "num_input_tokens_seen": 4146632, "step": 21750 }, { "epoch": 11.307172557172557, "grad_norm": 0.3881092369556427, "learning_rate": 2.366261341964328e-05, "loss": 0.1617, "num_input_tokens_seen": 4147624, "step": 21755 }, { "epoch": 11.30977130977131, "grad_norm": 0.3151644468307495, "learning_rate": 2.3651290596955785e-05, "loss": 0.0602, "num_input_tokens_seen": 4148552, "step": 21760 }, { "epoch": 11.312370062370062, "grad_norm": 0.8264963626861572, "learning_rate": 2.363996805172995e-05, "loss": 0.137, "num_input_tokens_seen": 4149480, "step": 21765 }, { "epoch": 11.314968814968815, "grad_norm": 0.43252992630004883, "learning_rate": 2.3628645786295097e-05, "loss": 0.256, "num_input_tokens_seen": 4150504, "step": 21770 }, { "epoch": 11.317567567567568, "grad_norm": 0.09243609756231308, "learning_rate": 2.3617323802980483e-05, "loss": 0.0858, "num_input_tokens_seen": 4151400, "step": 21775 }, { "epoch": 11.32016632016632, "grad_norm": 0.5558660626411438, "learning_rate": 2.360600210411531e-05, "loss": 0.1245, "num_input_tokens_seen": 4152392, "step": 21780 }, { "epoch": 11.322765072765073, "grad_norm": 0.29425257444381714, "learning_rate": 2.3594680692028728e-05, "loss": 0.1434, "num_input_tokens_seen": 4153384, "step": 21785 }, { "epoch": 11.325363825363825, "grad_norm": 0.6490148305892944, "learning_rate": 2.3583359569049822e-05, "loss": 0.1172, "num_input_tokens_seen": 4154280, "step": 21790 }, { "epoch": 11.327962577962579, "grad_norm": 0.09740714728832245, "learning_rate": 2.35720387375076e-05, "loss": 0.137, "num_input_tokens_seen": 4155208, "step": 21795 }, { "epoch": 11.33056133056133, "grad_norm": 0.1856049746274948, "learning_rate": 2.3560718199731045e-05, "loss": 0.1046, "num_input_tokens_seen": 4156136, "step": 21800 }, { "epoch": 11.333160083160083, "grad_norm": 0.48043590784072876, "learning_rate": 2.354939795804905e-05, "loss": 0.0939, "num_input_tokens_seen": 4157128, "step": 21805 }, { "epoch": 11.335758835758837, "grad_norm": 1.3622252941131592, "learning_rate": 2.3538078014790466e-05, "loss": 0.1184, "num_input_tokens_seen": 4158056, "step": 21810 }, { "epoch": 11.338357588357589, "grad_norm": 0.8123403787612915, "learning_rate": 2.3526758372284067e-05, "loss": 0.1533, "num_input_tokens_seen": 4159016, "step": 21815 }, { "epoch": 11.34095634095634, "grad_norm": 0.728013277053833, "learning_rate": 2.3515439032858582e-05, "loss": 0.1453, "num_input_tokens_seen": 4159912, "step": 21820 }, { "epoch": 11.343555093555093, "grad_norm": 0.16499893367290497, "learning_rate": 2.3504119998842664e-05, "loss": 0.0935, "num_input_tokens_seen": 4160840, "step": 21825 }, { "epoch": 11.346153846153847, "grad_norm": 1.0936795473098755, "learning_rate": 2.349280127256491e-05, "loss": 0.2035, "num_input_tokens_seen": 4161800, "step": 21830 }, { "epoch": 11.348752598752599, "grad_norm": 0.727662980556488, "learning_rate": 2.3481482856353846e-05, "loss": 0.1933, "num_input_tokens_seen": 4162792, "step": 21835 }, { "epoch": 11.35135135135135, "grad_norm": 0.63004070520401, "learning_rate": 2.347016475253795e-05, "loss": 0.0634, "num_input_tokens_seen": 4163720, "step": 21840 }, { "epoch": 11.353950103950105, "grad_norm": 0.7735393643379211, "learning_rate": 2.3458846963445617e-05, "loss": 0.0998, "num_input_tokens_seen": 4164648, "step": 21845 }, { "epoch": 11.356548856548857, "grad_norm": 1.446268081665039, "learning_rate": 2.3447529491405197e-05, "loss": 0.1304, "num_input_tokens_seen": 4165640, "step": 21850 }, { "epoch": 11.359147609147609, "grad_norm": 0.3388187885284424, "learning_rate": 2.3436212338744956e-05, "loss": 0.0873, "num_input_tokens_seen": 4166632, "step": 21855 }, { "epoch": 11.361746361746361, "grad_norm": 1.7731244564056396, "learning_rate": 2.3424895507793094e-05, "loss": 0.1805, "num_input_tokens_seen": 4167688, "step": 21860 }, { "epoch": 11.364345114345115, "grad_norm": 0.5730810761451721, "learning_rate": 2.3413579000877765e-05, "loss": 0.1033, "num_input_tokens_seen": 4168648, "step": 21865 }, { "epoch": 11.366943866943867, "grad_norm": 0.1730569452047348, "learning_rate": 2.3402262820327032e-05, "loss": 0.0421, "num_input_tokens_seen": 4169736, "step": 21870 }, { "epoch": 11.369542619542619, "grad_norm": 1.0113794803619385, "learning_rate": 2.3390946968468923e-05, "loss": 0.2551, "num_input_tokens_seen": 4170632, "step": 21875 }, { "epoch": 11.372141372141373, "grad_norm": 0.5659208297729492, "learning_rate": 2.3379631447631356e-05, "loss": 0.0509, "num_input_tokens_seen": 4171496, "step": 21880 }, { "epoch": 11.374740124740125, "grad_norm": 0.11892389506101608, "learning_rate": 2.336831626014222e-05, "loss": 0.2014, "num_input_tokens_seen": 4172520, "step": 21885 }, { "epoch": 11.377338877338877, "grad_norm": 0.41342562437057495, "learning_rate": 2.3357001408329305e-05, "loss": 0.1247, "num_input_tokens_seen": 4173480, "step": 21890 }, { "epoch": 11.37993762993763, "grad_norm": 1.289288878440857, "learning_rate": 2.334568689452036e-05, "loss": 0.2474, "num_input_tokens_seen": 4174440, "step": 21895 }, { "epoch": 11.382536382536383, "grad_norm": 0.1886880248785019, "learning_rate": 2.333437272104304e-05, "loss": 0.1026, "num_input_tokens_seen": 4175272, "step": 21900 }, { "epoch": 11.385135135135135, "grad_norm": 0.7710049748420715, "learning_rate": 2.3323058890224938e-05, "loss": 0.1038, "num_input_tokens_seen": 4176264, "step": 21905 }, { "epoch": 11.387733887733887, "grad_norm": 0.5443367958068848, "learning_rate": 2.3311745404393592e-05, "loss": 0.2157, "num_input_tokens_seen": 4177192, "step": 21910 }, { "epoch": 11.390332640332641, "grad_norm": 0.27912190556526184, "learning_rate": 2.330043226587644e-05, "loss": 0.1638, "num_input_tokens_seen": 4178120, "step": 21915 }, { "epoch": 11.392931392931393, "grad_norm": 0.4415137767791748, "learning_rate": 2.3289119477000887e-05, "loss": 0.2043, "num_input_tokens_seen": 4179016, "step": 21920 }, { "epoch": 11.395530145530145, "grad_norm": 1.4026609659194946, "learning_rate": 2.3277807040094218e-05, "loss": 0.145, "num_input_tokens_seen": 4180008, "step": 21925 }, { "epoch": 11.398128898128897, "grad_norm": 0.5545942783355713, "learning_rate": 2.3266494957483677e-05, "loss": 0.0438, "num_input_tokens_seen": 4180968, "step": 21930 }, { "epoch": 11.400727650727651, "grad_norm": 0.38249471783638, "learning_rate": 2.325518323149644e-05, "loss": 0.1548, "num_input_tokens_seen": 4181832, "step": 21935 }, { "epoch": 11.403326403326403, "grad_norm": 0.7355025410652161, "learning_rate": 2.3243871864459583e-05, "loss": 0.0649, "num_input_tokens_seen": 4182728, "step": 21940 }, { "epoch": 11.405925155925155, "grad_norm": 0.725480318069458, "learning_rate": 2.3232560858700137e-05, "loss": 0.0636, "num_input_tokens_seen": 4183720, "step": 21945 }, { "epoch": 11.40852390852391, "grad_norm": 1.5153759717941284, "learning_rate": 2.3221250216545037e-05, "loss": 0.1668, "num_input_tokens_seen": 4184744, "step": 21950 }, { "epoch": 11.411122661122661, "grad_norm": 0.5615174174308777, "learning_rate": 2.320993994032116e-05, "loss": 0.1274, "num_input_tokens_seen": 4185736, "step": 21955 }, { "epoch": 11.413721413721413, "grad_norm": 1.1906436681747437, "learning_rate": 2.3198630032355287e-05, "loss": 0.1636, "num_input_tokens_seen": 4186792, "step": 21960 }, { "epoch": 11.416320166320165, "grad_norm": 0.3876079320907593, "learning_rate": 2.318732049497415e-05, "loss": 0.1394, "num_input_tokens_seen": 4187784, "step": 21965 }, { "epoch": 11.41891891891892, "grad_norm": 0.12769627571105957, "learning_rate": 2.317601133050438e-05, "loss": 0.1124, "num_input_tokens_seen": 4188712, "step": 21970 }, { "epoch": 11.421517671517671, "grad_norm": 0.8760175108909607, "learning_rate": 2.3164702541272547e-05, "loss": 0.2541, "num_input_tokens_seen": 4189704, "step": 21975 }, { "epoch": 11.424116424116423, "grad_norm": 0.6136810779571533, "learning_rate": 2.315339412960513e-05, "loss": 0.1948, "num_input_tokens_seen": 4190696, "step": 21980 }, { "epoch": 11.426715176715177, "grad_norm": 1.7031219005584717, "learning_rate": 2.314208609782856e-05, "loss": 0.1967, "num_input_tokens_seen": 4191656, "step": 21985 }, { "epoch": 11.42931392931393, "grad_norm": 0.6795968413352966, "learning_rate": 2.3130778448269143e-05, "loss": 0.205, "num_input_tokens_seen": 4192744, "step": 21990 }, { "epoch": 11.431912681912682, "grad_norm": 0.9066057205200195, "learning_rate": 2.311947118325314e-05, "loss": 0.2177, "num_input_tokens_seen": 4193672, "step": 21995 }, { "epoch": 11.434511434511435, "grad_norm": 0.7976223230361938, "learning_rate": 2.3108164305106734e-05, "loss": 0.0868, "num_input_tokens_seen": 4194600, "step": 22000 }, { "epoch": 11.437110187110187, "grad_norm": 0.8943383693695068, "learning_rate": 2.309685781615601e-05, "loss": 0.0574, "num_input_tokens_seen": 4195528, "step": 22005 }, { "epoch": 11.43970893970894, "grad_norm": 0.5524896383285522, "learning_rate": 2.3085551718726986e-05, "loss": 0.1455, "num_input_tokens_seen": 4196424, "step": 22010 }, { "epoch": 11.442307692307692, "grad_norm": 1.6228710412979126, "learning_rate": 2.307424601514559e-05, "loss": 0.1086, "num_input_tokens_seen": 4197416, "step": 22015 }, { "epoch": 11.444906444906445, "grad_norm": 1.281663179397583, "learning_rate": 2.3062940707737686e-05, "loss": 0.2595, "num_input_tokens_seen": 4198312, "step": 22020 }, { "epoch": 11.447505197505198, "grad_norm": 0.9762022495269775, "learning_rate": 2.305163579882903e-05, "loss": 0.1944, "num_input_tokens_seen": 4199304, "step": 22025 }, { "epoch": 11.45010395010395, "grad_norm": 0.7892529964447021, "learning_rate": 2.3040331290745325e-05, "loss": 0.0979, "num_input_tokens_seen": 4200200, "step": 22030 }, { "epoch": 11.452702702702704, "grad_norm": 0.2124215066432953, "learning_rate": 2.3029027185812174e-05, "loss": 0.1419, "num_input_tokens_seen": 4201128, "step": 22035 }, { "epoch": 11.455301455301456, "grad_norm": 0.12891441583633423, "learning_rate": 2.3017723486355095e-05, "loss": 0.111, "num_input_tokens_seen": 4202056, "step": 22040 }, { "epoch": 11.457900207900208, "grad_norm": 0.5860406160354614, "learning_rate": 2.3006420194699534e-05, "loss": 0.1816, "num_input_tokens_seen": 4203080, "step": 22045 }, { "epoch": 11.46049896049896, "grad_norm": 0.4077262878417969, "learning_rate": 2.2995117313170855e-05, "loss": 0.2126, "num_input_tokens_seen": 4203944, "step": 22050 }, { "epoch": 11.463097713097714, "grad_norm": 0.18588438630104065, "learning_rate": 2.2983814844094312e-05, "loss": 0.1433, "num_input_tokens_seen": 4204840, "step": 22055 }, { "epoch": 11.465696465696466, "grad_norm": 0.5578171014785767, "learning_rate": 2.2972512789795103e-05, "loss": 0.1342, "num_input_tokens_seen": 4205832, "step": 22060 }, { "epoch": 11.468295218295218, "grad_norm": 0.2530505359172821, "learning_rate": 2.2961211152598325e-05, "loss": 0.0671, "num_input_tokens_seen": 4206792, "step": 22065 }, { "epoch": 11.470893970893972, "grad_norm": 0.3435325026512146, "learning_rate": 2.2949909934829e-05, "loss": 0.1404, "num_input_tokens_seen": 4207720, "step": 22070 }, { "epoch": 11.473492723492724, "grad_norm": 0.31869885325431824, "learning_rate": 2.2938609138812053e-05, "loss": 0.1041, "num_input_tokens_seen": 4208680, "step": 22075 }, { "epoch": 11.476091476091476, "grad_norm": 0.4741724133491516, "learning_rate": 2.2927308766872333e-05, "loss": 0.1323, "num_input_tokens_seen": 4209576, "step": 22080 }, { "epoch": 11.478690228690228, "grad_norm": 0.9160931706428528, "learning_rate": 2.2916008821334585e-05, "loss": 0.1412, "num_input_tokens_seen": 4210536, "step": 22085 }, { "epoch": 11.481288981288982, "grad_norm": 0.49974188208580017, "learning_rate": 2.2904709304523492e-05, "loss": 0.1542, "num_input_tokens_seen": 4211496, "step": 22090 }, { "epoch": 11.483887733887734, "grad_norm": 0.1388823539018631, "learning_rate": 2.2893410218763617e-05, "loss": 0.0886, "num_input_tokens_seen": 4212424, "step": 22095 }, { "epoch": 11.486486486486486, "grad_norm": 0.49979299306869507, "learning_rate": 2.2882111566379465e-05, "loss": 0.1556, "num_input_tokens_seen": 4213384, "step": 22100 }, { "epoch": 11.48908523908524, "grad_norm": 0.8988315463066101, "learning_rate": 2.287081334969543e-05, "loss": 0.0956, "num_input_tokens_seen": 4214280, "step": 22105 }, { "epoch": 11.491683991683992, "grad_norm": 0.34862279891967773, "learning_rate": 2.2859515571035835e-05, "loss": 0.037, "num_input_tokens_seen": 4215208, "step": 22110 }, { "epoch": 11.494282744282744, "grad_norm": 0.6281953454017639, "learning_rate": 2.2848218232724896e-05, "loss": 0.0892, "num_input_tokens_seen": 4216200, "step": 22115 }, { "epoch": 11.496881496881496, "grad_norm": 0.43378642201423645, "learning_rate": 2.2836921337086735e-05, "loss": 0.2365, "num_input_tokens_seen": 4217224, "step": 22120 }, { "epoch": 11.49948024948025, "grad_norm": 1.029105544090271, "learning_rate": 2.2825624886445407e-05, "loss": 0.1179, "num_input_tokens_seen": 4218184, "step": 22125 }, { "epoch": 11.502079002079002, "grad_norm": 0.1942366659641266, "learning_rate": 2.2814328883124848e-05, "loss": 0.0213, "num_input_tokens_seen": 4219048, "step": 22130 }, { "epoch": 11.504677754677754, "grad_norm": 0.4860829710960388, "learning_rate": 2.2803033329448927e-05, "loss": 0.0543, "num_input_tokens_seen": 4219976, "step": 22135 }, { "epoch": 11.507276507276508, "grad_norm": 0.11099112778902054, "learning_rate": 2.27917382277414e-05, "loss": 0.1805, "num_input_tokens_seen": 4220872, "step": 22140 }, { "epoch": 11.50987525987526, "grad_norm": 0.16986209154129028, "learning_rate": 2.2780443580325946e-05, "loss": 0.0674, "num_input_tokens_seen": 4221864, "step": 22145 }, { "epoch": 11.512474012474012, "grad_norm": 0.5606840252876282, "learning_rate": 2.276914938952614e-05, "loss": 0.0763, "num_input_tokens_seen": 4222792, "step": 22150 }, { "epoch": 11.515072765072766, "grad_norm": 1.2760597467422485, "learning_rate": 2.275785565766546e-05, "loss": 0.1866, "num_input_tokens_seen": 4223752, "step": 22155 }, { "epoch": 11.517671517671518, "grad_norm": 1.2966500520706177, "learning_rate": 2.2746562387067304e-05, "loss": 0.1014, "num_input_tokens_seen": 4224712, "step": 22160 }, { "epoch": 11.52027027027027, "grad_norm": 0.49453532695770264, "learning_rate": 2.2735269580054967e-05, "loss": 0.1264, "num_input_tokens_seen": 4225736, "step": 22165 }, { "epoch": 11.522869022869022, "grad_norm": 0.20802530646324158, "learning_rate": 2.2723977238951645e-05, "loss": 0.088, "num_input_tokens_seen": 4226696, "step": 22170 }, { "epoch": 11.525467775467776, "grad_norm": 1.6397110223770142, "learning_rate": 2.271268536608044e-05, "loss": 0.1316, "num_input_tokens_seen": 4227656, "step": 22175 }, { "epoch": 11.528066528066528, "grad_norm": 0.2579314708709717, "learning_rate": 2.2701393963764374e-05, "loss": 0.1576, "num_input_tokens_seen": 4228616, "step": 22180 }, { "epoch": 11.53066528066528, "grad_norm": 0.6272527575492859, "learning_rate": 2.2690103034326335e-05, "loss": 0.1249, "num_input_tokens_seen": 4229576, "step": 22185 }, { "epoch": 11.533264033264032, "grad_norm": 0.2737540304660797, "learning_rate": 2.267881258008914e-05, "loss": 0.1852, "num_input_tokens_seen": 4230536, "step": 22190 }, { "epoch": 11.535862785862786, "grad_norm": 1.0414319038391113, "learning_rate": 2.2667522603375524e-05, "loss": 0.1286, "num_input_tokens_seen": 4231400, "step": 22195 }, { "epoch": 11.538461538461538, "grad_norm": 0.5962792038917542, "learning_rate": 2.2656233106508083e-05, "loss": 0.0924, "num_input_tokens_seen": 4232328, "step": 22200 }, { "epoch": 11.54106029106029, "grad_norm": 0.399443119764328, "learning_rate": 2.2644944091809347e-05, "loss": 0.072, "num_input_tokens_seen": 4233256, "step": 22205 }, { "epoch": 11.543659043659044, "grad_norm": 0.3309454023838043, "learning_rate": 2.263365556160173e-05, "loss": 0.1867, "num_input_tokens_seen": 4234248, "step": 22210 }, { "epoch": 11.546257796257796, "grad_norm": 0.11855466663837433, "learning_rate": 2.2622367518207556e-05, "loss": 0.1059, "num_input_tokens_seen": 4235240, "step": 22215 }, { "epoch": 11.548856548856548, "grad_norm": 0.7043957710266113, "learning_rate": 2.261107996394904e-05, "loss": 0.075, "num_input_tokens_seen": 4236168, "step": 22220 }, { "epoch": 11.551455301455302, "grad_norm": 0.3296055197715759, "learning_rate": 2.2599792901148308e-05, "loss": 0.0567, "num_input_tokens_seen": 4237160, "step": 22225 }, { "epoch": 11.554054054054054, "grad_norm": 0.2840867042541504, "learning_rate": 2.2588506332127367e-05, "loss": 0.1921, "num_input_tokens_seen": 4238120, "step": 22230 }, { "epoch": 11.556652806652806, "grad_norm": 0.9573729038238525, "learning_rate": 2.2577220259208144e-05, "loss": 0.2264, "num_input_tokens_seen": 4239144, "step": 22235 }, { "epoch": 11.559251559251559, "grad_norm": 0.8979879021644592, "learning_rate": 2.2565934684712446e-05, "loss": 0.2371, "num_input_tokens_seen": 4240104, "step": 22240 }, { "epoch": 11.561850311850312, "grad_norm": 0.700228750705719, "learning_rate": 2.2554649610961995e-05, "loss": 0.2323, "num_input_tokens_seen": 4241064, "step": 22245 }, { "epoch": 11.564449064449065, "grad_norm": 1.2342450618743896, "learning_rate": 2.2543365040278388e-05, "loss": 0.1817, "num_input_tokens_seen": 4242120, "step": 22250 }, { "epoch": 11.567047817047817, "grad_norm": 0.7641770839691162, "learning_rate": 2.2532080974983128e-05, "loss": 0.3298, "num_input_tokens_seen": 4243112, "step": 22255 }, { "epoch": 11.56964656964657, "grad_norm": 0.32206353545188904, "learning_rate": 2.2520797417397628e-05, "loss": 0.1541, "num_input_tokens_seen": 4244072, "step": 22260 }, { "epoch": 11.572245322245323, "grad_norm": 0.9093949198722839, "learning_rate": 2.250951436984317e-05, "loss": 0.2024, "num_input_tokens_seen": 4245064, "step": 22265 }, { "epoch": 11.574844074844075, "grad_norm": 1.4027349948883057, "learning_rate": 2.2498231834640963e-05, "loss": 0.224, "num_input_tokens_seen": 4245992, "step": 22270 }, { "epoch": 11.577442827442827, "grad_norm": 0.8196515440940857, "learning_rate": 2.2486949814112077e-05, "loss": 0.2116, "num_input_tokens_seen": 4247016, "step": 22275 }, { "epoch": 11.58004158004158, "grad_norm": 0.9910497069358826, "learning_rate": 2.24756683105775e-05, "loss": 0.1342, "num_input_tokens_seen": 4248008, "step": 22280 }, { "epoch": 11.582640332640333, "grad_norm": 0.16255483031272888, "learning_rate": 2.2464387326358114e-05, "loss": 0.1514, "num_input_tokens_seen": 4248904, "step": 22285 }, { "epoch": 11.585239085239085, "grad_norm": 0.9445201754570007, "learning_rate": 2.245310686377467e-05, "loss": 0.0679, "num_input_tokens_seen": 4249864, "step": 22290 }, { "epoch": 11.587837837837839, "grad_norm": 1.6976319551467896, "learning_rate": 2.2441826925147844e-05, "loss": 0.186, "num_input_tokens_seen": 4250856, "step": 22295 }, { "epoch": 11.59043659043659, "grad_norm": 0.3268772065639496, "learning_rate": 2.2430547512798175e-05, "loss": 0.1063, "num_input_tokens_seen": 4251848, "step": 22300 }, { "epoch": 11.593035343035343, "grad_norm": 1.0207716226577759, "learning_rate": 2.241926862904612e-05, "loss": 0.0827, "num_input_tokens_seen": 4252776, "step": 22305 }, { "epoch": 11.595634095634095, "grad_norm": 0.12111031264066696, "learning_rate": 2.240799027621201e-05, "loss": 0.1434, "num_input_tokens_seen": 4253736, "step": 22310 }, { "epoch": 11.598232848232849, "grad_norm": 0.12195735424757004, "learning_rate": 2.2396712456616056e-05, "loss": 0.0945, "num_input_tokens_seen": 4254600, "step": 22315 }, { "epoch": 11.6008316008316, "grad_norm": 0.9877561330795288, "learning_rate": 2.2385435172578392e-05, "loss": 0.1615, "num_input_tokens_seen": 4255560, "step": 22320 }, { "epoch": 11.603430353430353, "grad_norm": 1.0536065101623535, "learning_rate": 2.2374158426419013e-05, "loss": 0.0933, "num_input_tokens_seen": 4256552, "step": 22325 }, { "epoch": 11.606029106029107, "grad_norm": 0.09741493314504623, "learning_rate": 2.2362882220457827e-05, "loss": 0.0867, "num_input_tokens_seen": 4257544, "step": 22330 }, { "epoch": 11.608627858627859, "grad_norm": 0.9937560558319092, "learning_rate": 2.23516065570146e-05, "loss": 0.0642, "num_input_tokens_seen": 4258504, "step": 22335 }, { "epoch": 11.611226611226611, "grad_norm": 1.4592639207839966, "learning_rate": 2.2340331438409026e-05, "loss": 0.1214, "num_input_tokens_seen": 4259496, "step": 22340 }, { "epoch": 11.613825363825363, "grad_norm": 2.2067384719848633, "learning_rate": 2.2329056866960647e-05, "loss": 0.144, "num_input_tokens_seen": 4260424, "step": 22345 }, { "epoch": 11.616424116424117, "grad_norm": 0.5727750062942505, "learning_rate": 2.2317782844988924e-05, "loss": 0.1518, "num_input_tokens_seen": 4261416, "step": 22350 }, { "epoch": 11.619022869022869, "grad_norm": 0.42759937047958374, "learning_rate": 2.2306509374813182e-05, "loss": 0.0647, "num_input_tokens_seen": 4262312, "step": 22355 }, { "epoch": 11.621621621621621, "grad_norm": 0.6598073840141296, "learning_rate": 2.2295236458752656e-05, "loss": 0.1395, "num_input_tokens_seen": 4263240, "step": 22360 }, { "epoch": 11.624220374220375, "grad_norm": 0.39612171053886414, "learning_rate": 2.2283964099126437e-05, "loss": 0.1322, "num_input_tokens_seen": 4264104, "step": 22365 }, { "epoch": 11.626819126819127, "grad_norm": 0.4922510087490082, "learning_rate": 2.2272692298253527e-05, "loss": 0.0404, "num_input_tokens_seen": 4265032, "step": 22370 }, { "epoch": 11.629417879417879, "grad_norm": 1.0438882112503052, "learning_rate": 2.2261421058452813e-05, "loss": 0.2019, "num_input_tokens_seen": 4265928, "step": 22375 }, { "epoch": 11.632016632016633, "grad_norm": 1.025863766670227, "learning_rate": 2.2250150382043036e-05, "loss": 0.1612, "num_input_tokens_seen": 4266920, "step": 22380 }, { "epoch": 11.634615384615385, "grad_norm": 0.7061724662780762, "learning_rate": 2.223888027134286e-05, "loss": 0.2194, "num_input_tokens_seen": 4267912, "step": 22385 }, { "epoch": 11.637214137214137, "grad_norm": 0.9217050075531006, "learning_rate": 2.2227610728670807e-05, "loss": 0.1714, "num_input_tokens_seen": 4268872, "step": 22390 }, { "epoch": 11.63981288981289, "grad_norm": 0.6836273074150085, "learning_rate": 2.2216341756345295e-05, "loss": 0.3105, "num_input_tokens_seen": 4269896, "step": 22395 }, { "epoch": 11.642411642411643, "grad_norm": 0.21015401184558868, "learning_rate": 2.220507335668461e-05, "loss": 0.1075, "num_input_tokens_seen": 4270792, "step": 22400 }, { "epoch": 11.645010395010395, "grad_norm": 0.14727629721164703, "learning_rate": 2.219380553200694e-05, "loss": 0.0824, "num_input_tokens_seen": 4271784, "step": 22405 }, { "epoch": 11.647609147609147, "grad_norm": 0.1493186205625534, "learning_rate": 2.2182538284630352e-05, "loss": 0.0983, "num_input_tokens_seen": 4272680, "step": 22410 }, { "epoch": 11.6502079002079, "grad_norm": 0.8817319869995117, "learning_rate": 2.2171271616872774e-05, "loss": 0.1537, "num_input_tokens_seen": 4273608, "step": 22415 }, { "epoch": 11.652806652806653, "grad_norm": 0.058714594691991806, "learning_rate": 2.2160005531052034e-05, "loss": 0.0875, "num_input_tokens_seen": 4274472, "step": 22420 }, { "epoch": 11.655405405405405, "grad_norm": 0.573270857334137, "learning_rate": 2.2148740029485833e-05, "loss": 0.0748, "num_input_tokens_seen": 4275464, "step": 22425 }, { "epoch": 11.658004158004157, "grad_norm": 0.9866743683815002, "learning_rate": 2.2137475114491754e-05, "loss": 0.1407, "num_input_tokens_seen": 4276392, "step": 22430 }, { "epoch": 11.660602910602911, "grad_norm": 0.25529971718788147, "learning_rate": 2.2126210788387255e-05, "loss": 0.0913, "num_input_tokens_seen": 4277384, "step": 22435 }, { "epoch": 11.663201663201663, "grad_norm": 0.9202139973640442, "learning_rate": 2.2114947053489686e-05, "loss": 0.1781, "num_input_tokens_seen": 4278376, "step": 22440 }, { "epoch": 11.665800415800415, "grad_norm": 0.36147767305374146, "learning_rate": 2.2103683912116262e-05, "loss": 0.0868, "num_input_tokens_seen": 4279272, "step": 22445 }, { "epoch": 11.66839916839917, "grad_norm": 0.8634151220321655, "learning_rate": 2.2092421366584065e-05, "loss": 0.2189, "num_input_tokens_seen": 4280168, "step": 22450 }, { "epoch": 11.670997920997921, "grad_norm": 0.06880289316177368, "learning_rate": 2.2081159419210086e-05, "loss": 0.0696, "num_input_tokens_seen": 4281192, "step": 22455 }, { "epoch": 11.673596673596673, "grad_norm": 1.2637865543365479, "learning_rate": 2.2069898072311167e-05, "loss": 0.1347, "num_input_tokens_seen": 4282120, "step": 22460 }, { "epoch": 11.676195426195425, "grad_norm": 1.1887917518615723, "learning_rate": 2.205863732820404e-05, "loss": 0.094, "num_input_tokens_seen": 4283144, "step": 22465 }, { "epoch": 11.67879417879418, "grad_norm": 0.3162832260131836, "learning_rate": 2.2047377189205302e-05, "loss": 0.0997, "num_input_tokens_seen": 4284040, "step": 22470 }, { "epoch": 11.681392931392931, "grad_norm": 1.6898540258407593, "learning_rate": 2.203611765763145e-05, "loss": 0.1007, "num_input_tokens_seen": 4284936, "step": 22475 }, { "epoch": 11.683991683991684, "grad_norm": 0.3418751060962677, "learning_rate": 2.2024858735798814e-05, "loss": 0.0808, "num_input_tokens_seen": 4285960, "step": 22480 }, { "epoch": 11.686590436590437, "grad_norm": 0.45291373133659363, "learning_rate": 2.201360042602364e-05, "loss": 0.31, "num_input_tokens_seen": 4286920, "step": 22485 }, { "epoch": 11.68918918918919, "grad_norm": 0.46634939312934875, "learning_rate": 2.2002342730622017e-05, "loss": 0.2369, "num_input_tokens_seen": 4287848, "step": 22490 }, { "epoch": 11.691787941787942, "grad_norm": 0.13923104107379913, "learning_rate": 2.1991085651909935e-05, "loss": 0.1119, "num_input_tokens_seen": 4288776, "step": 22495 }, { "epoch": 11.694386694386694, "grad_norm": 0.1579054296016693, "learning_rate": 2.1979829192203233e-05, "loss": 0.1196, "num_input_tokens_seen": 4289736, "step": 22500 }, { "epoch": 11.696985446985448, "grad_norm": 0.7732113003730774, "learning_rate": 2.196857335381764e-05, "loss": 0.0864, "num_input_tokens_seen": 4290664, "step": 22505 }, { "epoch": 11.6995841995842, "grad_norm": 0.4376024603843689, "learning_rate": 2.1957318139068752e-05, "loss": 0.2202, "num_input_tokens_seen": 4291592, "step": 22510 }, { "epoch": 11.702182952182952, "grad_norm": 0.4823603332042694, "learning_rate": 2.194606355027202e-05, "loss": 0.1357, "num_input_tokens_seen": 4292616, "step": 22515 }, { "epoch": 11.704781704781706, "grad_norm": 0.5705369114875793, "learning_rate": 2.1934809589742797e-05, "loss": 0.0691, "num_input_tokens_seen": 4293544, "step": 22520 }, { "epoch": 11.707380457380458, "grad_norm": 0.04352056607604027, "learning_rate": 2.1923556259796278e-05, "loss": 0.0859, "num_input_tokens_seen": 4294536, "step": 22525 }, { "epoch": 11.70997920997921, "grad_norm": 0.28736940026283264, "learning_rate": 2.191230356274755e-05, "loss": 0.1039, "num_input_tokens_seen": 4295528, "step": 22530 }, { "epoch": 11.712577962577962, "grad_norm": 1.458091378211975, "learning_rate": 2.1901051500911563e-05, "loss": 0.171, "num_input_tokens_seen": 4296520, "step": 22535 }, { "epoch": 11.715176715176716, "grad_norm": 0.40763160586357117, "learning_rate": 2.1889800076603124e-05, "loss": 0.136, "num_input_tokens_seen": 4297416, "step": 22540 }, { "epoch": 11.717775467775468, "grad_norm": 0.3054076135158539, "learning_rate": 2.1878549292136928e-05, "loss": 0.0886, "num_input_tokens_seen": 4298376, "step": 22545 }, { "epoch": 11.72037422037422, "grad_norm": 0.34785881638526917, "learning_rate": 2.186729914982753e-05, "loss": 0.0845, "num_input_tokens_seen": 4299368, "step": 22550 }, { "epoch": 11.722972972972974, "grad_norm": 0.7690040469169617, "learning_rate": 2.1856049651989347e-05, "loss": 0.1233, "num_input_tokens_seen": 4300264, "step": 22555 }, { "epoch": 11.725571725571726, "grad_norm": 0.7560325860977173, "learning_rate": 2.1844800800936667e-05, "loss": 0.0504, "num_input_tokens_seen": 4301160, "step": 22560 }, { "epoch": 11.728170478170478, "grad_norm": 0.10629364848136902, "learning_rate": 2.1833552598983658e-05, "loss": 0.0445, "num_input_tokens_seen": 4302152, "step": 22565 }, { "epoch": 11.73076923076923, "grad_norm": 0.03796372190117836, "learning_rate": 2.1822305048444328e-05, "loss": 0.1726, "num_input_tokens_seen": 4303144, "step": 22570 }, { "epoch": 11.733367983367984, "grad_norm": 0.5300215482711792, "learning_rate": 2.1811058151632586e-05, "loss": 0.1035, "num_input_tokens_seen": 4304168, "step": 22575 }, { "epoch": 11.735966735966736, "grad_norm": 1.1978394985198975, "learning_rate": 2.1799811910862166e-05, "loss": 0.2451, "num_input_tokens_seen": 4305192, "step": 22580 }, { "epoch": 11.738565488565488, "grad_norm": 0.26813891530036926, "learning_rate": 2.1788566328446695e-05, "loss": 0.1274, "num_input_tokens_seen": 4306184, "step": 22585 }, { "epoch": 11.741164241164242, "grad_norm": 0.056397829204797745, "learning_rate": 2.1777321406699664e-05, "loss": 0.0823, "num_input_tokens_seen": 4307144, "step": 22590 }, { "epoch": 11.743762993762994, "grad_norm": 0.3460753858089447, "learning_rate": 2.1766077147934408e-05, "loss": 0.05, "num_input_tokens_seen": 4308040, "step": 22595 }, { "epoch": 11.746361746361746, "grad_norm": 0.13595589995384216, "learning_rate": 2.1754833554464153e-05, "loss": 0.1791, "num_input_tokens_seen": 4309000, "step": 22600 }, { "epoch": 11.7489604989605, "grad_norm": 0.25602859258651733, "learning_rate": 2.1743590628601963e-05, "loss": 0.1607, "num_input_tokens_seen": 4309992, "step": 22605 }, { "epoch": 11.751559251559252, "grad_norm": 0.12242985516786575, "learning_rate": 2.1732348372660785e-05, "loss": 0.1764, "num_input_tokens_seen": 4310952, "step": 22610 }, { "epoch": 11.754158004158004, "grad_norm": 0.46553149819374084, "learning_rate": 2.172110678895341e-05, "loss": 0.1872, "num_input_tokens_seen": 4311880, "step": 22615 }, { "epoch": 11.756756756756756, "grad_norm": 0.16196465492248535, "learning_rate": 2.1709865879792514e-05, "loss": 0.149, "num_input_tokens_seen": 4312840, "step": 22620 }, { "epoch": 11.75935550935551, "grad_norm": 0.4475483000278473, "learning_rate": 2.16986256474906e-05, "loss": 0.0861, "num_input_tokens_seen": 4313832, "step": 22625 }, { "epoch": 11.761954261954262, "grad_norm": 0.39966753125190735, "learning_rate": 2.1687386094360067e-05, "loss": 0.1608, "num_input_tokens_seen": 4314792, "step": 22630 }, { "epoch": 11.764553014553014, "grad_norm": 0.16944916546344757, "learning_rate": 2.167614722271315e-05, "loss": 0.1103, "num_input_tokens_seen": 4315656, "step": 22635 }, { "epoch": 11.767151767151766, "grad_norm": 1.0795778036117554, "learning_rate": 2.166490903486197e-05, "loss": 0.1135, "num_input_tokens_seen": 4316616, "step": 22640 }, { "epoch": 11.76975051975052, "grad_norm": 0.46517741680145264, "learning_rate": 2.1653671533118468e-05, "loss": 0.1696, "num_input_tokens_seen": 4317640, "step": 22645 }, { "epoch": 11.772349272349272, "grad_norm": 0.40839776396751404, "learning_rate": 2.164243471979447e-05, "loss": 0.1366, "num_input_tokens_seen": 4318536, "step": 22650 }, { "epoch": 11.774948024948024, "grad_norm": 0.808806836605072, "learning_rate": 2.1631198597201662e-05, "loss": 0.1071, "num_input_tokens_seen": 4319464, "step": 22655 }, { "epoch": 11.777546777546778, "grad_norm": 0.5009179711341858, "learning_rate": 2.1619963167651586e-05, "loss": 0.0893, "num_input_tokens_seen": 4320424, "step": 22660 }, { "epoch": 11.78014553014553, "grad_norm": 1.3064030408859253, "learning_rate": 2.1608728433455625e-05, "loss": 0.2047, "num_input_tokens_seen": 4321352, "step": 22665 }, { "epoch": 11.782744282744282, "grad_norm": 1.006243348121643, "learning_rate": 2.1597494396925048e-05, "loss": 0.1577, "num_input_tokens_seen": 4322280, "step": 22670 }, { "epoch": 11.785343035343036, "grad_norm": 0.640764594078064, "learning_rate": 2.1586261060370945e-05, "loss": 0.1302, "num_input_tokens_seen": 4323176, "step": 22675 }, { "epoch": 11.787941787941788, "grad_norm": 0.3556065857410431, "learning_rate": 2.15750284261043e-05, "loss": 0.0699, "num_input_tokens_seen": 4324200, "step": 22680 }, { "epoch": 11.79054054054054, "grad_norm": 0.5109695792198181, "learning_rate": 2.156379649643592e-05, "loss": 0.2071, "num_input_tokens_seen": 4325128, "step": 22685 }, { "epoch": 11.793139293139292, "grad_norm": 0.13297437131404877, "learning_rate": 2.155256527367649e-05, "loss": 0.0491, "num_input_tokens_seen": 4326024, "step": 22690 }, { "epoch": 11.795738045738046, "grad_norm": 1.633135437965393, "learning_rate": 2.1541334760136528e-05, "loss": 0.1839, "num_input_tokens_seen": 4326920, "step": 22695 }, { "epoch": 11.798336798336798, "grad_norm": 0.18445082008838654, "learning_rate": 2.1530104958126434e-05, "loss": 0.1157, "num_input_tokens_seen": 4327944, "step": 22700 }, { "epoch": 11.80093555093555, "grad_norm": 0.5725605487823486, "learning_rate": 2.151887586995644e-05, "loss": 0.1886, "num_input_tokens_seen": 4328840, "step": 22705 }, { "epoch": 11.803534303534304, "grad_norm": 0.877704918384552, "learning_rate": 2.1507647497936625e-05, "loss": 0.2173, "num_input_tokens_seen": 4329800, "step": 22710 }, { "epoch": 11.806133056133056, "grad_norm": 0.3701685667037964, "learning_rate": 2.149641984437695e-05, "loss": 0.1065, "num_input_tokens_seen": 4330696, "step": 22715 }, { "epoch": 11.808731808731808, "grad_norm": 0.33445999026298523, "learning_rate": 2.1485192911587195e-05, "loss": 0.139, "num_input_tokens_seen": 4331624, "step": 22720 }, { "epoch": 11.81133056133056, "grad_norm": 0.14037717878818512, "learning_rate": 2.1473966701877025e-05, "loss": 0.1218, "num_input_tokens_seen": 4332520, "step": 22725 }, { "epoch": 11.813929313929314, "grad_norm": 0.5877376794815063, "learning_rate": 2.146274121755592e-05, "loss": 0.1343, "num_input_tokens_seen": 4333480, "step": 22730 }, { "epoch": 11.816528066528067, "grad_norm": 1.0443090200424194, "learning_rate": 2.1451516460933246e-05, "loss": 0.1254, "num_input_tokens_seen": 4334440, "step": 22735 }, { "epoch": 11.819126819126819, "grad_norm": 0.5145530104637146, "learning_rate": 2.144029243431819e-05, "loss": 0.0953, "num_input_tokens_seen": 4335336, "step": 22740 }, { "epoch": 11.821725571725572, "grad_norm": 1.1380550861358643, "learning_rate": 2.1429069140019814e-05, "loss": 0.1307, "num_input_tokens_seen": 4336296, "step": 22745 }, { "epoch": 11.824324324324325, "grad_norm": 0.24574580788612366, "learning_rate": 2.1417846580347005e-05, "loss": 0.2049, "num_input_tokens_seen": 4337256, "step": 22750 }, { "epoch": 11.826923076923077, "grad_norm": 1.2013384103775024, "learning_rate": 2.1406624757608526e-05, "loss": 0.0506, "num_input_tokens_seen": 4338152, "step": 22755 }, { "epoch": 11.829521829521829, "grad_norm": 1.1106266975402832, "learning_rate": 2.1395403674112957e-05, "loss": 0.0947, "num_input_tokens_seen": 4339016, "step": 22760 }, { "epoch": 11.832120582120583, "grad_norm": 0.16878236830234528, "learning_rate": 2.138418333216875e-05, "loss": 0.196, "num_input_tokens_seen": 4339976, "step": 22765 }, { "epoch": 11.834719334719335, "grad_norm": 1.0321077108383179, "learning_rate": 2.1372963734084213e-05, "loss": 0.2088, "num_input_tokens_seen": 4340872, "step": 22770 }, { "epoch": 11.837318087318087, "grad_norm": 0.3048718273639679, "learning_rate": 2.1361744882167446e-05, "loss": 0.0979, "num_input_tokens_seen": 4341864, "step": 22775 }, { "epoch": 11.83991683991684, "grad_norm": 0.20144307613372803, "learning_rate": 2.135052677872646e-05, "loss": 0.1888, "num_input_tokens_seen": 4342824, "step": 22780 }, { "epoch": 11.842515592515593, "grad_norm": 0.3494527041912079, "learning_rate": 2.1339309426069094e-05, "loss": 0.0681, "num_input_tokens_seen": 4343752, "step": 22785 }, { "epoch": 11.845114345114345, "grad_norm": 1.1145297288894653, "learning_rate": 2.1328092826502998e-05, "loss": 0.0985, "num_input_tokens_seen": 4344648, "step": 22790 }, { "epoch": 11.847713097713097, "grad_norm": 0.7908114194869995, "learning_rate": 2.131687698233572e-05, "loss": 0.1189, "num_input_tokens_seen": 4345544, "step": 22795 }, { "epoch": 11.85031185031185, "grad_norm": 0.06083479896187782, "learning_rate": 2.130566189587461e-05, "loss": 0.0725, "num_input_tokens_seen": 4346568, "step": 22800 }, { "epoch": 11.852910602910603, "grad_norm": 1.1568011045455933, "learning_rate": 2.129444756942689e-05, "loss": 0.1152, "num_input_tokens_seen": 4347560, "step": 22805 }, { "epoch": 11.855509355509355, "grad_norm": 0.5477353930473328, "learning_rate": 2.12832340052996e-05, "loss": 0.0648, "num_input_tokens_seen": 4348488, "step": 22810 }, { "epoch": 11.858108108108109, "grad_norm": 0.29120156168937683, "learning_rate": 2.127202120579965e-05, "loss": 0.0604, "num_input_tokens_seen": 4349384, "step": 22815 }, { "epoch": 11.86070686070686, "grad_norm": 0.17020417749881744, "learning_rate": 2.1260809173233775e-05, "loss": 0.1349, "num_input_tokens_seen": 4350280, "step": 22820 }, { "epoch": 11.863305613305613, "grad_norm": 0.8745192885398865, "learning_rate": 2.124959790990856e-05, "loss": 0.1717, "num_input_tokens_seen": 4351240, "step": 22825 }, { "epoch": 11.865904365904367, "grad_norm": 0.576145350933075, "learning_rate": 2.1238387418130422e-05, "loss": 0.1818, "num_input_tokens_seen": 4352200, "step": 22830 }, { "epoch": 11.868503118503119, "grad_norm": 0.15181350708007812, "learning_rate": 2.122717770020564e-05, "loss": 0.1509, "num_input_tokens_seen": 4353192, "step": 22835 }, { "epoch": 11.871101871101871, "grad_norm": 0.0769166424870491, "learning_rate": 2.121596875844031e-05, "loss": 0.1292, "num_input_tokens_seen": 4354184, "step": 22840 }, { "epoch": 11.873700623700623, "grad_norm": 0.6498090028762817, "learning_rate": 2.1204760595140373e-05, "loss": 0.1311, "num_input_tokens_seen": 4355144, "step": 22845 }, { "epoch": 11.876299376299377, "grad_norm": 0.1830112636089325, "learning_rate": 2.1193553212611627e-05, "loss": 0.1383, "num_input_tokens_seen": 4356072, "step": 22850 }, { "epoch": 11.878898128898129, "grad_norm": 1.1668727397918701, "learning_rate": 2.1182346613159688e-05, "loss": 0.1775, "num_input_tokens_seen": 4357032, "step": 22855 }, { "epoch": 11.881496881496881, "grad_norm": 0.6648356318473816, "learning_rate": 2.117114079909003e-05, "loss": 0.1487, "num_input_tokens_seen": 4358024, "step": 22860 }, { "epoch": 11.884095634095633, "grad_norm": 0.2146187275648117, "learning_rate": 2.1159935772707944e-05, "loss": 0.0439, "num_input_tokens_seen": 4358920, "step": 22865 }, { "epoch": 11.886694386694387, "grad_norm": 0.21170182526111603, "learning_rate": 2.1148731536318587e-05, "loss": 0.0726, "num_input_tokens_seen": 4359944, "step": 22870 }, { "epoch": 11.88929313929314, "grad_norm": 0.6813265681266785, "learning_rate": 2.1137528092226918e-05, "loss": 0.1233, "num_input_tokens_seen": 4360904, "step": 22875 }, { "epoch": 11.891891891891891, "grad_norm": 0.5339928865432739, "learning_rate": 2.1126325442737775e-05, "loss": 0.1573, "num_input_tokens_seen": 4361928, "step": 22880 }, { "epoch": 11.894490644490645, "grad_norm": 0.7848577499389648, "learning_rate": 2.111512359015579e-05, "loss": 0.1163, "num_input_tokens_seen": 4362856, "step": 22885 }, { "epoch": 11.897089397089397, "grad_norm": 0.9644594788551331, "learning_rate": 2.1103922536785456e-05, "loss": 0.0785, "num_input_tokens_seen": 4363816, "step": 22890 }, { "epoch": 11.89968814968815, "grad_norm": 0.2001425176858902, "learning_rate": 2.1092722284931108e-05, "loss": 0.0447, "num_input_tokens_seen": 4364712, "step": 22895 }, { "epoch": 11.902286902286903, "grad_norm": 0.7123824954032898, "learning_rate": 2.10815228368969e-05, "loss": 0.1408, "num_input_tokens_seen": 4365704, "step": 22900 }, { "epoch": 11.904885654885655, "grad_norm": 0.11622052639722824, "learning_rate": 2.107032419498681e-05, "loss": 0.1333, "num_input_tokens_seen": 4366696, "step": 22905 }, { "epoch": 11.907484407484407, "grad_norm": 0.7783423066139221, "learning_rate": 2.1059126361504684e-05, "loss": 0.098, "num_input_tokens_seen": 4367688, "step": 22910 }, { "epoch": 11.91008316008316, "grad_norm": 0.3858347237110138, "learning_rate": 2.1047929338754173e-05, "loss": 0.1624, "num_input_tokens_seen": 4368584, "step": 22915 }, { "epoch": 11.912681912681913, "grad_norm": 0.21762266755104065, "learning_rate": 2.103673312903878e-05, "loss": 0.1528, "num_input_tokens_seen": 4369512, "step": 22920 }, { "epoch": 11.915280665280665, "grad_norm": 0.5436348915100098, "learning_rate": 2.1025537734661818e-05, "loss": 0.1321, "num_input_tokens_seen": 4370472, "step": 22925 }, { "epoch": 11.917879417879417, "grad_norm": 0.10762808471918106, "learning_rate": 2.1014343157926465e-05, "loss": 0.1504, "num_input_tokens_seen": 4371336, "step": 22930 }, { "epoch": 11.920478170478171, "grad_norm": 1.1093361377716064, "learning_rate": 2.1003149401135694e-05, "loss": 0.1169, "num_input_tokens_seen": 4372200, "step": 22935 }, { "epoch": 11.923076923076923, "grad_norm": 1.0143297910690308, "learning_rate": 2.0991956466592342e-05, "loss": 0.1567, "num_input_tokens_seen": 4373192, "step": 22940 }, { "epoch": 11.925675675675675, "grad_norm": 0.5550302863121033, "learning_rate": 2.098076435659905e-05, "loss": 0.072, "num_input_tokens_seen": 4374088, "step": 22945 }, { "epoch": 11.928274428274428, "grad_norm": 0.5417736172676086, "learning_rate": 2.0969573073458315e-05, "loss": 0.1224, "num_input_tokens_seen": 4375048, "step": 22950 }, { "epoch": 11.930873180873181, "grad_norm": 0.8740038871765137, "learning_rate": 2.095838261947244e-05, "loss": 0.2293, "num_input_tokens_seen": 4376040, "step": 22955 }, { "epoch": 11.933471933471933, "grad_norm": 1.0072968006134033, "learning_rate": 2.0947192996943572e-05, "loss": 0.0786, "num_input_tokens_seen": 4376968, "step": 22960 }, { "epoch": 11.936070686070686, "grad_norm": 0.8398459553718567, "learning_rate": 2.0936004208173693e-05, "loss": 0.2151, "num_input_tokens_seen": 4377928, "step": 22965 }, { "epoch": 11.93866943866944, "grad_norm": 0.23866121470928192, "learning_rate": 2.0924816255464578e-05, "loss": 0.0764, "num_input_tokens_seen": 4378888, "step": 22970 }, { "epoch": 11.941268191268192, "grad_norm": 1.4679299592971802, "learning_rate": 2.0913629141117883e-05, "loss": 0.1917, "num_input_tokens_seen": 4379880, "step": 22975 }, { "epoch": 11.943866943866944, "grad_norm": 0.2634184956550598, "learning_rate": 2.090244286743504e-05, "loss": 0.1282, "num_input_tokens_seen": 4380840, "step": 22980 }, { "epoch": 11.946465696465696, "grad_norm": 0.6163161396980286, "learning_rate": 2.0891257436717353e-05, "loss": 0.1422, "num_input_tokens_seen": 4381768, "step": 22985 }, { "epoch": 11.94906444906445, "grad_norm": 0.1105194091796875, "learning_rate": 2.0880072851265913e-05, "loss": 0.1404, "num_input_tokens_seen": 4382696, "step": 22990 }, { "epoch": 11.951663201663202, "grad_norm": 0.18033386766910553, "learning_rate": 2.0868889113381675e-05, "loss": 0.0858, "num_input_tokens_seen": 4383592, "step": 22995 }, { "epoch": 11.954261954261954, "grad_norm": 0.3665599524974823, "learning_rate": 2.0857706225365382e-05, "loss": 0.1392, "num_input_tokens_seen": 4384488, "step": 23000 }, { "epoch": 11.956860706860708, "grad_norm": 0.21538576483726501, "learning_rate": 2.084652418951764e-05, "loss": 0.196, "num_input_tokens_seen": 4385448, "step": 23005 }, { "epoch": 11.95945945945946, "grad_norm": 0.7527633309364319, "learning_rate": 2.083534300813884e-05, "loss": 0.2105, "num_input_tokens_seen": 4386344, "step": 23010 }, { "epoch": 11.962058212058212, "grad_norm": 1.3195070028305054, "learning_rate": 2.0824162683529224e-05, "loss": 0.1214, "num_input_tokens_seen": 4387304, "step": 23015 }, { "epoch": 11.964656964656964, "grad_norm": 0.2135809063911438, "learning_rate": 2.0812983217988864e-05, "loss": 0.2246, "num_input_tokens_seen": 4388200, "step": 23020 }, { "epoch": 11.967255717255718, "grad_norm": 1.138630747795105, "learning_rate": 2.0801804613817627e-05, "loss": 0.3427, "num_input_tokens_seen": 4389192, "step": 23025 }, { "epoch": 11.96985446985447, "grad_norm": 0.876160740852356, "learning_rate": 2.0790626873315233e-05, "loss": 0.1036, "num_input_tokens_seen": 4390184, "step": 23030 }, { "epoch": 11.972453222453222, "grad_norm": 0.659544050693512, "learning_rate": 2.0779449998781194e-05, "loss": 0.1589, "num_input_tokens_seen": 4391176, "step": 23035 }, { "epoch": 11.975051975051976, "grad_norm": 0.7570502758026123, "learning_rate": 2.0768273992514863e-05, "loss": 0.1668, "num_input_tokens_seen": 4392200, "step": 23040 }, { "epoch": 11.977650727650728, "grad_norm": 0.3380901515483856, "learning_rate": 2.0757098856815415e-05, "loss": 0.1152, "num_input_tokens_seen": 4393160, "step": 23045 }, { "epoch": 11.98024948024948, "grad_norm": 0.8510189652442932, "learning_rate": 2.0745924593981837e-05, "loss": 0.146, "num_input_tokens_seen": 4394088, "step": 23050 }, { "epoch": 11.982848232848234, "grad_norm": 1.7651947736740112, "learning_rate": 2.0734751206312953e-05, "loss": 0.1017, "num_input_tokens_seen": 4395048, "step": 23055 }, { "epoch": 11.985446985446986, "grad_norm": 0.534778356552124, "learning_rate": 2.072357869610738e-05, "loss": 0.1876, "num_input_tokens_seen": 4396072, "step": 23060 }, { "epoch": 11.988045738045738, "grad_norm": 0.17208535969257355, "learning_rate": 2.071240706566358e-05, "loss": 0.1452, "num_input_tokens_seen": 4397064, "step": 23065 }, { "epoch": 11.99064449064449, "grad_norm": 0.9957441091537476, "learning_rate": 2.0701236317279815e-05, "loss": 0.1842, "num_input_tokens_seen": 4397992, "step": 23070 }, { "epoch": 11.993243243243244, "grad_norm": 0.6502397060394287, "learning_rate": 2.0690066453254185e-05, "loss": 0.0748, "num_input_tokens_seen": 4398920, "step": 23075 }, { "epoch": 11.995841995841996, "grad_norm": 1.2842357158660889, "learning_rate": 2.0678897475884585e-05, "loss": 0.1578, "num_input_tokens_seen": 4399848, "step": 23080 }, { "epoch": 11.998440748440748, "grad_norm": 0.4820807874202728, "learning_rate": 2.066772938746875e-05, "loss": 0.0752, "num_input_tokens_seen": 4400808, "step": 23085 }, { "epoch": 12.0, "eval_loss": 0.14636999368667603, "eval_runtime": 8.0212, "eval_samples_per_second": 106.717, "eval_steps_per_second": 26.679, "num_input_tokens_seen": 4401344, "step": 23088 }, { "epoch": 12.001039501039502, "grad_norm": 0.8490922451019287, "learning_rate": 2.0656562190304214e-05, "loss": 0.1111, "num_input_tokens_seen": 4401760, "step": 23090 }, { "epoch": 12.003638253638254, "grad_norm": 0.958345890045166, "learning_rate": 2.064539588668835e-05, "loss": 0.1106, "num_input_tokens_seen": 4402656, "step": 23095 }, { "epoch": 12.006237006237006, "grad_norm": 0.7739866375923157, "learning_rate": 2.0634230478918315e-05, "loss": 0.188, "num_input_tokens_seen": 4403552, "step": 23100 }, { "epoch": 12.008835758835758, "grad_norm": 0.9684879779815674, "learning_rate": 2.0623065969291105e-05, "loss": 0.0855, "num_input_tokens_seen": 4404544, "step": 23105 }, { "epoch": 12.011434511434512, "grad_norm": 0.3753507435321808, "learning_rate": 2.061190236010353e-05, "loss": 0.0945, "num_input_tokens_seen": 4405472, "step": 23110 }, { "epoch": 12.014033264033264, "grad_norm": 0.35484328866004944, "learning_rate": 2.0600739653652202e-05, "loss": 0.118, "num_input_tokens_seen": 4406432, "step": 23115 }, { "epoch": 12.016632016632016, "grad_norm": 1.015441656112671, "learning_rate": 2.0589577852233568e-05, "loss": 0.1746, "num_input_tokens_seen": 4407392, "step": 23120 }, { "epoch": 12.01923076923077, "grad_norm": 0.6508034467697144, "learning_rate": 2.0578416958143866e-05, "loss": 0.0737, "num_input_tokens_seen": 4408320, "step": 23125 }, { "epoch": 12.021829521829522, "grad_norm": 1.211046814918518, "learning_rate": 2.0567256973679167e-05, "loss": 0.1358, "num_input_tokens_seen": 4409248, "step": 23130 }, { "epoch": 12.024428274428274, "grad_norm": 1.813826560974121, "learning_rate": 2.0556097901135333e-05, "loss": 0.1373, "num_input_tokens_seen": 4410272, "step": 23135 }, { "epoch": 12.027027027027026, "grad_norm": 0.3188634216785431, "learning_rate": 2.0544939742808058e-05, "loss": 0.1071, "num_input_tokens_seen": 4411168, "step": 23140 }, { "epoch": 12.02962577962578, "grad_norm": 0.20132891833782196, "learning_rate": 2.053378250099285e-05, "loss": 0.1042, "num_input_tokens_seen": 4412096, "step": 23145 }, { "epoch": 12.032224532224532, "grad_norm": 0.733383059501648, "learning_rate": 2.0522626177985004e-05, "loss": 0.0668, "num_input_tokens_seen": 4412992, "step": 23150 }, { "epoch": 12.034823284823284, "grad_norm": 0.4923068881034851, "learning_rate": 2.0511470776079657e-05, "loss": 0.1176, "num_input_tokens_seen": 4413920, "step": 23155 }, { "epoch": 12.037422037422038, "grad_norm": 0.5173265933990479, "learning_rate": 2.0500316297571736e-05, "loss": 0.1503, "num_input_tokens_seen": 4414912, "step": 23160 }, { "epoch": 12.04002079002079, "grad_norm": 0.8367125988006592, "learning_rate": 2.048916274475597e-05, "loss": 0.1612, "num_input_tokens_seen": 4415904, "step": 23165 }, { "epoch": 12.042619542619542, "grad_norm": 0.9634041786193848, "learning_rate": 2.0478010119926926e-05, "loss": 0.1487, "num_input_tokens_seen": 4416896, "step": 23170 }, { "epoch": 12.045218295218294, "grad_norm": 0.7248539924621582, "learning_rate": 2.0466858425378954e-05, "loss": 0.1752, "num_input_tokens_seen": 4417856, "step": 23175 }, { "epoch": 12.047817047817048, "grad_norm": 0.2724947929382324, "learning_rate": 2.0455707663406234e-05, "loss": 0.0971, "num_input_tokens_seen": 4418848, "step": 23180 }, { "epoch": 12.0504158004158, "grad_norm": 0.14195264875888824, "learning_rate": 2.044455783630273e-05, "loss": 0.0859, "num_input_tokens_seen": 4419840, "step": 23185 }, { "epoch": 12.053014553014552, "grad_norm": 1.1656413078308105, "learning_rate": 2.0433408946362246e-05, "loss": 0.2692, "num_input_tokens_seen": 4420768, "step": 23190 }, { "epoch": 12.055613305613306, "grad_norm": 0.6738688349723816, "learning_rate": 2.0422260995878354e-05, "loss": 0.0885, "num_input_tokens_seen": 4421760, "step": 23195 }, { "epoch": 12.058212058212058, "grad_norm": 0.7527058720588684, "learning_rate": 2.0411113987144466e-05, "loss": 0.1008, "num_input_tokens_seen": 4422656, "step": 23200 }, { "epoch": 12.06081081081081, "grad_norm": 0.13446331024169922, "learning_rate": 2.039996792245378e-05, "loss": 0.0783, "num_input_tokens_seen": 4423584, "step": 23205 }, { "epoch": 12.063409563409563, "grad_norm": 0.7707799673080444, "learning_rate": 2.038882280409932e-05, "loss": 0.1256, "num_input_tokens_seen": 4424480, "step": 23210 }, { "epoch": 12.066008316008316, "grad_norm": 0.1834922730922699, "learning_rate": 2.0377678634373887e-05, "loss": 0.1422, "num_input_tokens_seen": 4425408, "step": 23215 }, { "epoch": 12.068607068607069, "grad_norm": 0.880279541015625, "learning_rate": 2.0366535415570113e-05, "loss": 0.0801, "num_input_tokens_seen": 4426336, "step": 23220 }, { "epoch": 12.07120582120582, "grad_norm": 0.46885010600090027, "learning_rate": 2.0355393149980423e-05, "loss": 0.1593, "num_input_tokens_seen": 4427264, "step": 23225 }, { "epoch": 12.073804573804575, "grad_norm": 0.24367643892765045, "learning_rate": 2.0344251839897034e-05, "loss": 0.2511, "num_input_tokens_seen": 4428128, "step": 23230 }, { "epoch": 12.076403326403327, "grad_norm": 0.26687559485435486, "learning_rate": 2.0333111487611992e-05, "loss": 0.116, "num_input_tokens_seen": 4429120, "step": 23235 }, { "epoch": 12.079002079002079, "grad_norm": 0.5423389673233032, "learning_rate": 2.0321972095417126e-05, "loss": 0.0746, "num_input_tokens_seen": 4430144, "step": 23240 }, { "epoch": 12.08160083160083, "grad_norm": 0.8827858567237854, "learning_rate": 2.031083366560408e-05, "loss": 0.2086, "num_input_tokens_seen": 4431104, "step": 23245 }, { "epoch": 12.084199584199585, "grad_norm": 1.023364543914795, "learning_rate": 2.0299696200464286e-05, "loss": 0.2508, "num_input_tokens_seen": 4432128, "step": 23250 }, { "epoch": 12.086798336798337, "grad_norm": 0.11591264605522156, "learning_rate": 2.0288559702288998e-05, "loss": 0.0285, "num_input_tokens_seen": 4433056, "step": 23255 }, { "epoch": 12.089397089397089, "grad_norm": 1.3270869255065918, "learning_rate": 2.0277424173369246e-05, "loss": 0.2387, "num_input_tokens_seen": 4434048, "step": 23260 }, { "epoch": 12.091995841995843, "grad_norm": 0.374724805355072, "learning_rate": 2.026628961599588e-05, "loss": 0.2353, "num_input_tokens_seen": 4435072, "step": 23265 }, { "epoch": 12.094594594594595, "grad_norm": 0.6469802856445312, "learning_rate": 2.0255156032459555e-05, "loss": 0.1126, "num_input_tokens_seen": 4435968, "step": 23270 }, { "epoch": 12.097193347193347, "grad_norm": 0.9952681660652161, "learning_rate": 2.0244023425050692e-05, "loss": 0.1893, "num_input_tokens_seen": 4436896, "step": 23275 }, { "epoch": 12.0997920997921, "grad_norm": 0.3216904401779175, "learning_rate": 2.0232891796059554e-05, "loss": 0.1164, "num_input_tokens_seen": 4437856, "step": 23280 }, { "epoch": 12.102390852390853, "grad_norm": 0.6957006454467773, "learning_rate": 2.022176114777617e-05, "loss": 0.0708, "num_input_tokens_seen": 4438816, "step": 23285 }, { "epoch": 12.104989604989605, "grad_norm": 1.3795299530029297, "learning_rate": 2.0210631482490384e-05, "loss": 0.1363, "num_input_tokens_seen": 4439744, "step": 23290 }, { "epoch": 12.107588357588357, "grad_norm": 0.20667873322963715, "learning_rate": 2.019950280249184e-05, "loss": 0.1617, "num_input_tokens_seen": 4440672, "step": 23295 }, { "epoch": 12.11018711018711, "grad_norm": 0.10166648030281067, "learning_rate": 2.0188375110069956e-05, "loss": 0.0802, "num_input_tokens_seen": 4441632, "step": 23300 }, { "epoch": 12.112785862785863, "grad_norm": 1.8947051763534546, "learning_rate": 2.0177248407513973e-05, "loss": 0.2419, "num_input_tokens_seen": 4442496, "step": 23305 }, { "epoch": 12.115384615384615, "grad_norm": 0.7507355809211731, "learning_rate": 2.016612269711292e-05, "loss": 0.184, "num_input_tokens_seen": 4443456, "step": 23310 }, { "epoch": 12.117983367983369, "grad_norm": 1.2669600248336792, "learning_rate": 2.015499798115562e-05, "loss": 0.1467, "num_input_tokens_seen": 4444448, "step": 23315 }, { "epoch": 12.120582120582121, "grad_norm": 0.12163732945919037, "learning_rate": 2.0143874261930686e-05, "loss": 0.2, "num_input_tokens_seen": 4445344, "step": 23320 }, { "epoch": 12.123180873180873, "grad_norm": 1.0077308416366577, "learning_rate": 2.0132751541726546e-05, "loss": 0.1669, "num_input_tokens_seen": 4446272, "step": 23325 }, { "epoch": 12.125779625779625, "grad_norm": 0.6712356209754944, "learning_rate": 2.0121629822831396e-05, "loss": 0.1286, "num_input_tokens_seen": 4447296, "step": 23330 }, { "epoch": 12.128378378378379, "grad_norm": 1.4439315795898438, "learning_rate": 2.0110509107533243e-05, "loss": 0.1431, "num_input_tokens_seen": 4448288, "step": 23335 }, { "epoch": 12.130977130977131, "grad_norm": 0.4522207975387573, "learning_rate": 2.009938939811988e-05, "loss": 0.0649, "num_input_tokens_seen": 4449216, "step": 23340 }, { "epoch": 12.133575883575883, "grad_norm": 0.4027131497859955, "learning_rate": 2.0088270696878905e-05, "loss": 0.093, "num_input_tokens_seen": 4450208, "step": 23345 }, { "epoch": 12.136174636174637, "grad_norm": 0.5470183491706848, "learning_rate": 2.0077153006097688e-05, "loss": 0.1775, "num_input_tokens_seen": 4451168, "step": 23350 }, { "epoch": 12.138773388773389, "grad_norm": 0.713134229183197, "learning_rate": 2.0066036328063415e-05, "loss": 0.1245, "num_input_tokens_seen": 4452160, "step": 23355 }, { "epoch": 12.141372141372141, "grad_norm": 0.18743981420993805, "learning_rate": 2.005492066506305e-05, "loss": 0.0684, "num_input_tokens_seen": 4453088, "step": 23360 }, { "epoch": 12.143970893970893, "grad_norm": 0.798452615737915, "learning_rate": 2.004380601938334e-05, "loss": 0.1554, "num_input_tokens_seen": 4453984, "step": 23365 }, { "epoch": 12.146569646569647, "grad_norm": 0.12467189878225327, "learning_rate": 2.0032692393310835e-05, "loss": 0.0502, "num_input_tokens_seen": 4454880, "step": 23370 }, { "epoch": 12.1491683991684, "grad_norm": 0.24384599924087524, "learning_rate": 2.0021579789131876e-05, "loss": 0.1097, "num_input_tokens_seen": 4455808, "step": 23375 }, { "epoch": 12.151767151767151, "grad_norm": 0.498149037361145, "learning_rate": 2.001046820913259e-05, "loss": 0.0285, "num_input_tokens_seen": 4456704, "step": 23380 }, { "epoch": 12.154365904365905, "grad_norm": 1.0029330253601074, "learning_rate": 1.9999357655598893e-05, "loss": 0.1911, "num_input_tokens_seen": 4457632, "step": 23385 }, { "epoch": 12.156964656964657, "grad_norm": 0.4356977641582489, "learning_rate": 1.998824813081649e-05, "loss": 0.0565, "num_input_tokens_seen": 4458688, "step": 23390 }, { "epoch": 12.15956340956341, "grad_norm": 0.24327675998210907, "learning_rate": 1.997713963707088e-05, "loss": 0.1442, "num_input_tokens_seen": 4459616, "step": 23395 }, { "epoch": 12.162162162162161, "grad_norm": 0.998049795627594, "learning_rate": 1.9966032176647338e-05, "loss": 0.2336, "num_input_tokens_seen": 4460608, "step": 23400 }, { "epoch": 12.164760914760915, "grad_norm": 0.2493637651205063, "learning_rate": 1.9954925751830943e-05, "loss": 0.0812, "num_input_tokens_seen": 4461568, "step": 23405 }, { "epoch": 12.167359667359667, "grad_norm": 0.6302502155303955, "learning_rate": 1.9943820364906535e-05, "loss": 0.1303, "num_input_tokens_seen": 4462496, "step": 23410 }, { "epoch": 12.16995841995842, "grad_norm": 0.14633844792842865, "learning_rate": 1.9932716018158777e-05, "loss": 0.133, "num_input_tokens_seen": 4463424, "step": 23415 }, { "epoch": 12.172557172557173, "grad_norm": 0.5952870845794678, "learning_rate": 1.992161271387208e-05, "loss": 0.1767, "num_input_tokens_seen": 4464416, "step": 23420 }, { "epoch": 12.175155925155925, "grad_norm": 0.7651233077049255, "learning_rate": 1.9910510454330674e-05, "loss": 0.1394, "num_input_tokens_seen": 4465344, "step": 23425 }, { "epoch": 12.177754677754677, "grad_norm": 0.10695645958185196, "learning_rate": 1.9899409241818548e-05, "loss": 0.0704, "num_input_tokens_seen": 4466240, "step": 23430 }, { "epoch": 12.18035343035343, "grad_norm": 0.262850284576416, "learning_rate": 1.988830907861948e-05, "loss": 0.0637, "num_input_tokens_seen": 4467136, "step": 23435 }, { "epoch": 12.182952182952183, "grad_norm": 0.5055703520774841, "learning_rate": 1.9877209967017053e-05, "loss": 0.1417, "num_input_tokens_seen": 4468128, "step": 23440 }, { "epoch": 12.185550935550935, "grad_norm": 1.0350618362426758, "learning_rate": 1.9866111909294605e-05, "loss": 0.1541, "num_input_tokens_seen": 4469152, "step": 23445 }, { "epoch": 12.188149688149688, "grad_norm": 0.2161925882101059, "learning_rate": 1.9855014907735283e-05, "loss": 0.0876, "num_input_tokens_seen": 4470080, "step": 23450 }, { "epoch": 12.190748440748441, "grad_norm": 0.9998037815093994, "learning_rate": 1.9843918964621995e-05, "loss": 0.2437, "num_input_tokens_seen": 4471008, "step": 23455 }, { "epoch": 12.193347193347194, "grad_norm": 0.7152946591377258, "learning_rate": 1.983282408223745e-05, "loss": 0.1849, "num_input_tokens_seen": 4471936, "step": 23460 }, { "epoch": 12.195945945945946, "grad_norm": 0.2211332619190216, "learning_rate": 1.9821730262864118e-05, "loss": 0.1255, "num_input_tokens_seen": 4472896, "step": 23465 }, { "epoch": 12.198544698544698, "grad_norm": 0.10648494213819504, "learning_rate": 1.981063750878428e-05, "loss": 0.1071, "num_input_tokens_seen": 4473888, "step": 23470 }, { "epoch": 12.201143451143452, "grad_norm": 0.39570510387420654, "learning_rate": 1.9799545822279957e-05, "loss": 0.0875, "num_input_tokens_seen": 4474880, "step": 23475 }, { "epoch": 12.203742203742204, "grad_norm": 0.16268357634544373, "learning_rate": 1.9788455205632995e-05, "loss": 0.1569, "num_input_tokens_seen": 4475872, "step": 23480 }, { "epoch": 12.206340956340956, "grad_norm": 0.4236225187778473, "learning_rate": 1.977736566112498e-05, "loss": 0.123, "num_input_tokens_seen": 4476864, "step": 23485 }, { "epoch": 12.20893970893971, "grad_norm": 0.7653684020042419, "learning_rate": 1.9766277191037315e-05, "loss": 0.1066, "num_input_tokens_seen": 4477792, "step": 23490 }, { "epoch": 12.211538461538462, "grad_norm": 0.08436805754899979, "learning_rate": 1.9755189797651147e-05, "loss": 0.1295, "num_input_tokens_seen": 4478720, "step": 23495 }, { "epoch": 12.214137214137214, "grad_norm": 0.2532823383808136, "learning_rate": 1.9744103483247415e-05, "loss": 0.1193, "num_input_tokens_seen": 4479648, "step": 23500 }, { "epoch": 12.216735966735968, "grad_norm": 1.016541600227356, "learning_rate": 1.973301825010685e-05, "loss": 0.0901, "num_input_tokens_seen": 4480544, "step": 23505 }, { "epoch": 12.21933471933472, "grad_norm": 0.09086650609970093, "learning_rate": 1.9721934100509937e-05, "loss": 0.0473, "num_input_tokens_seen": 4481408, "step": 23510 }, { "epoch": 12.221933471933472, "grad_norm": 2.156580924987793, "learning_rate": 1.9710851036736955e-05, "loss": 0.099, "num_input_tokens_seen": 4482304, "step": 23515 }, { "epoch": 12.224532224532224, "grad_norm": 1.9248894453048706, "learning_rate": 1.9699769061067964e-05, "loss": 0.1874, "num_input_tokens_seen": 4483232, "step": 23520 }, { "epoch": 12.227130977130978, "grad_norm": 0.4080572724342346, "learning_rate": 1.9688688175782773e-05, "loss": 0.0803, "num_input_tokens_seen": 4484160, "step": 23525 }, { "epoch": 12.22972972972973, "grad_norm": 0.49502429366111755, "learning_rate": 1.9677608383160996e-05, "loss": 0.1988, "num_input_tokens_seen": 4485024, "step": 23530 }, { "epoch": 12.232328482328482, "grad_norm": 0.226585254073143, "learning_rate": 1.9666529685482004e-05, "loss": 0.0448, "num_input_tokens_seen": 4485984, "step": 23535 }, { "epoch": 12.234927234927236, "grad_norm": 0.18278445303440094, "learning_rate": 1.9655452085024958e-05, "loss": 0.1555, "num_input_tokens_seen": 4486912, "step": 23540 }, { "epoch": 12.237525987525988, "grad_norm": 0.867986798286438, "learning_rate": 1.9644375584068776e-05, "loss": 0.0798, "num_input_tokens_seen": 4487904, "step": 23545 }, { "epoch": 12.24012474012474, "grad_norm": 0.7179014086723328, "learning_rate": 1.963330018489216e-05, "loss": 0.1378, "num_input_tokens_seen": 4488960, "step": 23550 }, { "epoch": 12.242723492723492, "grad_norm": 0.7618197798728943, "learning_rate": 1.9622225889773594e-05, "loss": 0.1039, "num_input_tokens_seen": 4489920, "step": 23555 }, { "epoch": 12.245322245322246, "grad_norm": 0.7408340573310852, "learning_rate": 1.9611152700991302e-05, "loss": 0.1599, "num_input_tokens_seen": 4490848, "step": 23560 }, { "epoch": 12.247920997920998, "grad_norm": 0.7434263825416565, "learning_rate": 1.9600080620823323e-05, "loss": 0.1299, "num_input_tokens_seen": 4491840, "step": 23565 }, { "epoch": 12.25051975051975, "grad_norm": 0.9129399061203003, "learning_rate": 1.958900965154743e-05, "loss": 0.0967, "num_input_tokens_seen": 4492768, "step": 23570 }, { "epoch": 12.253118503118504, "grad_norm": 0.2694189250469208, "learning_rate": 1.9577939795441205e-05, "loss": 0.1032, "num_input_tokens_seen": 4493696, "step": 23575 }, { "epoch": 12.255717255717256, "grad_norm": 0.5107887387275696, "learning_rate": 1.9566871054781966e-05, "loss": 0.2317, "num_input_tokens_seen": 4494688, "step": 23580 }, { "epoch": 12.258316008316008, "grad_norm": 1.337133526802063, "learning_rate": 1.9555803431846824e-05, "loss": 0.1334, "num_input_tokens_seen": 4495776, "step": 23585 }, { "epoch": 12.26091476091476, "grad_norm": 0.664078414440155, "learning_rate": 1.9544736928912652e-05, "loss": 0.107, "num_input_tokens_seen": 4496800, "step": 23590 }, { "epoch": 12.263513513513514, "grad_norm": 0.27407306432724, "learning_rate": 1.9533671548256095e-05, "loss": 0.0947, "num_input_tokens_seen": 4497792, "step": 23595 }, { "epoch": 12.266112266112266, "grad_norm": 0.3883967101573944, "learning_rate": 1.9522607292153557e-05, "loss": 0.1554, "num_input_tokens_seen": 4498816, "step": 23600 }, { "epoch": 12.268711018711018, "grad_norm": 0.6339928507804871, "learning_rate": 1.951154416288123e-05, "loss": 0.122, "num_input_tokens_seen": 4499744, "step": 23605 }, { "epoch": 12.271309771309772, "grad_norm": 0.6234005689620972, "learning_rate": 1.9500482162715054e-05, "loss": 0.1889, "num_input_tokens_seen": 4500672, "step": 23610 }, { "epoch": 12.273908523908524, "grad_norm": 0.2684011161327362, "learning_rate": 1.9489421293930754e-05, "loss": 0.2158, "num_input_tokens_seen": 4501600, "step": 23615 }, { "epoch": 12.276507276507276, "grad_norm": 1.0097761154174805, "learning_rate": 1.9478361558803817e-05, "loss": 0.1009, "num_input_tokens_seen": 4502560, "step": 23620 }, { "epoch": 12.279106029106028, "grad_norm": 0.09371107071638107, "learning_rate": 1.946730295960948e-05, "loss": 0.1418, "num_input_tokens_seen": 4503552, "step": 23625 }, { "epoch": 12.281704781704782, "grad_norm": 1.1151245832443237, "learning_rate": 1.9456245498622765e-05, "loss": 0.1701, "num_input_tokens_seen": 4504512, "step": 23630 }, { "epoch": 12.284303534303534, "grad_norm": 1.3942062854766846, "learning_rate": 1.9445189178118464e-05, "loss": 0.1882, "num_input_tokens_seen": 4505472, "step": 23635 }, { "epoch": 12.286902286902286, "grad_norm": 0.09049534797668457, "learning_rate": 1.9434134000371116e-05, "loss": 0.0557, "num_input_tokens_seen": 4506464, "step": 23640 }, { "epoch": 12.28950103950104, "grad_norm": 0.5370703935623169, "learning_rate": 1.9423079967655043e-05, "loss": 0.1609, "num_input_tokens_seen": 4507392, "step": 23645 }, { "epoch": 12.292099792099792, "grad_norm": 0.13568562269210815, "learning_rate": 1.9412027082244314e-05, "loss": 0.0693, "num_input_tokens_seen": 4508288, "step": 23650 }, { "epoch": 12.294698544698544, "grad_norm": 1.0986878871917725, "learning_rate": 1.940097534641278e-05, "loss": 0.157, "num_input_tokens_seen": 4509248, "step": 23655 }, { "epoch": 12.297297297297296, "grad_norm": 0.7420035004615784, "learning_rate": 1.9389924762434035e-05, "loss": 0.1696, "num_input_tokens_seen": 4510208, "step": 23660 }, { "epoch": 12.29989604989605, "grad_norm": 1.0194288492202759, "learning_rate": 1.9378875332581463e-05, "loss": 0.1806, "num_input_tokens_seen": 4511200, "step": 23665 }, { "epoch": 12.302494802494802, "grad_norm": 1.6690599918365479, "learning_rate": 1.936782705912818e-05, "loss": 0.1495, "num_input_tokens_seen": 4512160, "step": 23670 }, { "epoch": 12.305093555093555, "grad_norm": 0.2500310242176056, "learning_rate": 1.935677994434709e-05, "loss": 0.1098, "num_input_tokens_seen": 4513088, "step": 23675 }, { "epoch": 12.307692307692308, "grad_norm": 0.7761510610580444, "learning_rate": 1.9345733990510835e-05, "loss": 0.1976, "num_input_tokens_seen": 4514144, "step": 23680 }, { "epoch": 12.31029106029106, "grad_norm": 0.9436663389205933, "learning_rate": 1.9334689199891854e-05, "loss": 0.2327, "num_input_tokens_seen": 4515200, "step": 23685 }, { "epoch": 12.312889812889813, "grad_norm": 0.19647422432899475, "learning_rate": 1.9323645574762303e-05, "loss": 0.0833, "num_input_tokens_seen": 4516160, "step": 23690 }, { "epoch": 12.315488565488565, "grad_norm": 0.3113352358341217, "learning_rate": 1.9312603117394118e-05, "loss": 0.0774, "num_input_tokens_seen": 4517088, "step": 23695 }, { "epoch": 12.318087318087318, "grad_norm": 0.7446953654289246, "learning_rate": 1.930156183005901e-05, "loss": 0.1539, "num_input_tokens_seen": 4518048, "step": 23700 }, { "epoch": 12.32068607068607, "grad_norm": 1.5691941976547241, "learning_rate": 1.929052171502842e-05, "loss": 0.117, "num_input_tokens_seen": 4519008, "step": 23705 }, { "epoch": 12.323284823284823, "grad_norm": 0.1763860583305359, "learning_rate": 1.9279482774573573e-05, "loss": 0.0408, "num_input_tokens_seen": 4519968, "step": 23710 }, { "epoch": 12.325883575883577, "grad_norm": 0.6962054967880249, "learning_rate": 1.9268445010965434e-05, "loss": 0.1303, "num_input_tokens_seen": 4520928, "step": 23715 }, { "epoch": 12.328482328482329, "grad_norm": 1.085131287574768, "learning_rate": 1.9257408426474748e-05, "loss": 0.199, "num_input_tokens_seen": 4521888, "step": 23720 }, { "epoch": 12.33108108108108, "grad_norm": 0.7180739045143127, "learning_rate": 1.924637302337199e-05, "loss": 0.082, "num_input_tokens_seen": 4522880, "step": 23725 }, { "epoch": 12.333679833679835, "grad_norm": 0.5292729139328003, "learning_rate": 1.923533880392741e-05, "loss": 0.118, "num_input_tokens_seen": 4523872, "step": 23730 }, { "epoch": 12.336278586278587, "grad_norm": 0.5918347239494324, "learning_rate": 1.922430577041101e-05, "loss": 0.2661, "num_input_tokens_seen": 4524864, "step": 23735 }, { "epoch": 12.338877338877339, "grad_norm": 0.2355911135673523, "learning_rate": 1.9213273925092546e-05, "loss": 0.172, "num_input_tokens_seen": 4525728, "step": 23740 }, { "epoch": 12.34147609147609, "grad_norm": 0.4853992760181427, "learning_rate": 1.9202243270241533e-05, "loss": 0.134, "num_input_tokens_seen": 4526656, "step": 23745 }, { "epoch": 12.344074844074845, "grad_norm": 0.12713094055652618, "learning_rate": 1.919121380812725e-05, "loss": 0.0883, "num_input_tokens_seen": 4527584, "step": 23750 }, { "epoch": 12.346673596673597, "grad_norm": 0.2578716576099396, "learning_rate": 1.9180185541018695e-05, "loss": 0.1166, "num_input_tokens_seen": 4528544, "step": 23755 }, { "epoch": 12.349272349272349, "grad_norm": 0.8504797220230103, "learning_rate": 1.9169158471184674e-05, "loss": 0.0615, "num_input_tokens_seen": 4529504, "step": 23760 }, { "epoch": 12.351871101871103, "grad_norm": 1.1136795282363892, "learning_rate": 1.915813260089369e-05, "loss": 0.158, "num_input_tokens_seen": 4530432, "step": 23765 }, { "epoch": 12.354469854469855, "grad_norm": 0.16636162996292114, "learning_rate": 1.9147107932414052e-05, "loss": 0.0649, "num_input_tokens_seen": 4531328, "step": 23770 }, { "epoch": 12.357068607068607, "grad_norm": 0.12880535423755646, "learning_rate": 1.9136084468013778e-05, "loss": 0.1676, "num_input_tokens_seen": 4532352, "step": 23775 }, { "epoch": 12.359667359667359, "grad_norm": 1.2182294130325317, "learning_rate": 1.9125062209960677e-05, "loss": 0.1342, "num_input_tokens_seen": 4533216, "step": 23780 }, { "epoch": 12.362266112266113, "grad_norm": 3.3629651069641113, "learning_rate": 1.9114041160522267e-05, "loss": 0.2227, "num_input_tokens_seen": 4534112, "step": 23785 }, { "epoch": 12.364864864864865, "grad_norm": 0.7457774877548218, "learning_rate": 1.910302132196586e-05, "loss": 0.1235, "num_input_tokens_seen": 4535040, "step": 23790 }, { "epoch": 12.367463617463617, "grad_norm": 0.4551204741001129, "learning_rate": 1.9092002696558488e-05, "loss": 0.1598, "num_input_tokens_seen": 4536000, "step": 23795 }, { "epoch": 12.37006237006237, "grad_norm": 0.4127981960773468, "learning_rate": 1.908098528656695e-05, "loss": 0.0693, "num_input_tokens_seen": 4536928, "step": 23800 }, { "epoch": 12.372661122661123, "grad_norm": 0.4300064444541931, "learning_rate": 1.906996909425779e-05, "loss": 0.0557, "num_input_tokens_seen": 4537952, "step": 23805 }, { "epoch": 12.375259875259875, "grad_norm": 0.9401205778121948, "learning_rate": 1.90589541218973e-05, "loss": 0.1759, "num_input_tokens_seen": 4538976, "step": 23810 }, { "epoch": 12.377858627858627, "grad_norm": 0.7954080104827881, "learning_rate": 1.904794037175153e-05, "loss": 0.1012, "num_input_tokens_seen": 4539936, "step": 23815 }, { "epoch": 12.380457380457381, "grad_norm": 0.25377169251441956, "learning_rate": 1.903692784608625e-05, "loss": 0.0745, "num_input_tokens_seen": 4540864, "step": 23820 }, { "epoch": 12.383056133056133, "grad_norm": 0.9766589403152466, "learning_rate": 1.9025916547167018e-05, "loss": 0.1545, "num_input_tokens_seen": 4541824, "step": 23825 }, { "epoch": 12.385654885654885, "grad_norm": 2.6196682453155518, "learning_rate": 1.9014906477259108e-05, "loss": 0.2592, "num_input_tokens_seen": 4542784, "step": 23830 }, { "epoch": 12.388253638253639, "grad_norm": 0.5167152881622314, "learning_rate": 1.900389763862757e-05, "loss": 0.062, "num_input_tokens_seen": 4543808, "step": 23835 }, { "epoch": 12.390852390852391, "grad_norm": 1.3960427045822144, "learning_rate": 1.8992890033537163e-05, "loss": 0.1292, "num_input_tokens_seen": 4544736, "step": 23840 }, { "epoch": 12.393451143451143, "grad_norm": 0.2525588870048523, "learning_rate": 1.8981883664252438e-05, "loss": 0.213, "num_input_tokens_seen": 4545760, "step": 23845 }, { "epoch": 12.396049896049895, "grad_norm": 1.0063585042953491, "learning_rate": 1.8970878533037645e-05, "loss": 0.0954, "num_input_tokens_seen": 4546752, "step": 23850 }, { "epoch": 12.39864864864865, "grad_norm": 0.6001266241073608, "learning_rate": 1.8959874642156816e-05, "loss": 0.1709, "num_input_tokens_seen": 4547712, "step": 23855 }, { "epoch": 12.401247401247401, "grad_norm": 1.3686970472335815, "learning_rate": 1.8948871993873703e-05, "loss": 0.1983, "num_input_tokens_seen": 4548704, "step": 23860 }, { "epoch": 12.403846153846153, "grad_norm": 0.553684413433075, "learning_rate": 1.893787059045183e-05, "loss": 0.1591, "num_input_tokens_seen": 4549600, "step": 23865 }, { "epoch": 12.406444906444907, "grad_norm": 1.0388723611831665, "learning_rate": 1.8926870434154426e-05, "loss": 0.1444, "num_input_tokens_seen": 4550464, "step": 23870 }, { "epoch": 12.40904365904366, "grad_norm": 1.0920207500457764, "learning_rate": 1.89158715272445e-05, "loss": 0.0987, "num_input_tokens_seen": 4551360, "step": 23875 }, { "epoch": 12.411642411642411, "grad_norm": 0.1783185750246048, "learning_rate": 1.8904873871984795e-05, "loss": 0.088, "num_input_tokens_seen": 4552320, "step": 23880 }, { "epoch": 12.414241164241163, "grad_norm": 0.8944645524024963, "learning_rate": 1.8893877470637776e-05, "loss": 0.1747, "num_input_tokens_seen": 4553312, "step": 23885 }, { "epoch": 12.416839916839917, "grad_norm": 0.35693231225013733, "learning_rate": 1.8882882325465663e-05, "loss": 0.1251, "num_input_tokens_seen": 4554240, "step": 23890 }, { "epoch": 12.41943866943867, "grad_norm": 0.21073727309703827, "learning_rate": 1.887188843873043e-05, "loss": 0.0688, "num_input_tokens_seen": 4555136, "step": 23895 }, { "epoch": 12.422037422037421, "grad_norm": 0.13736474514007568, "learning_rate": 1.8860895812693778e-05, "loss": 0.122, "num_input_tokens_seen": 4556096, "step": 23900 }, { "epoch": 12.424636174636175, "grad_norm": 0.2826419770717621, "learning_rate": 1.8849904449617155e-05, "loss": 0.042, "num_input_tokens_seen": 4556928, "step": 23905 }, { "epoch": 12.427234927234927, "grad_norm": 0.20867504179477692, "learning_rate": 1.8838914351761736e-05, "loss": 0.1967, "num_input_tokens_seen": 4557856, "step": 23910 }, { "epoch": 12.42983367983368, "grad_norm": 0.7661455869674683, "learning_rate": 1.882792552138846e-05, "loss": 0.0506, "num_input_tokens_seen": 4558784, "step": 23915 }, { "epoch": 12.432432432432432, "grad_norm": 0.15003272891044617, "learning_rate": 1.881693796075798e-05, "loss": 0.0748, "num_input_tokens_seen": 4559712, "step": 23920 }, { "epoch": 12.435031185031185, "grad_norm": 0.06537748873233795, "learning_rate": 1.8805951672130705e-05, "loss": 0.1441, "num_input_tokens_seen": 4560672, "step": 23925 }, { "epoch": 12.437629937629938, "grad_norm": 0.5936759114265442, "learning_rate": 1.879496665776678e-05, "loss": 0.1182, "num_input_tokens_seen": 4561632, "step": 23930 }, { "epoch": 12.44022869022869, "grad_norm": 0.8191075921058655, "learning_rate": 1.8783982919926074e-05, "loss": 0.1299, "num_input_tokens_seen": 4562592, "step": 23935 }, { "epoch": 12.442827442827443, "grad_norm": 1.0375244617462158, "learning_rate": 1.8773000460868205e-05, "loss": 0.122, "num_input_tokens_seen": 4563488, "step": 23940 }, { "epoch": 12.445426195426196, "grad_norm": 0.5525816679000854, "learning_rate": 1.8762019282852544e-05, "loss": 0.098, "num_input_tokens_seen": 4564448, "step": 23945 }, { "epoch": 12.448024948024948, "grad_norm": 0.7236865162849426, "learning_rate": 1.875103938813816e-05, "loss": 0.1264, "num_input_tokens_seen": 4565440, "step": 23950 }, { "epoch": 12.450623700623701, "grad_norm": 0.36628422141075134, "learning_rate": 1.8740060778983883e-05, "loss": 0.1228, "num_input_tokens_seen": 4566368, "step": 23955 }, { "epoch": 12.453222453222454, "grad_norm": 0.3495146334171295, "learning_rate": 1.8729083457648283e-05, "loss": 0.0933, "num_input_tokens_seen": 4567296, "step": 23960 }, { "epoch": 12.455821205821206, "grad_norm": 0.8285994529724121, "learning_rate": 1.871810742638965e-05, "loss": 0.1312, "num_input_tokens_seen": 4568256, "step": 23965 }, { "epoch": 12.458419958419958, "grad_norm": 0.5712424516677856, "learning_rate": 1.8707132687466018e-05, "loss": 0.2006, "num_input_tokens_seen": 4569184, "step": 23970 }, { "epoch": 12.461018711018712, "grad_norm": 0.8713810443878174, "learning_rate": 1.8696159243135143e-05, "loss": 0.1586, "num_input_tokens_seen": 4570144, "step": 23975 }, { "epoch": 12.463617463617464, "grad_norm": 0.531471848487854, "learning_rate": 1.868518709565454e-05, "loss": 0.0918, "num_input_tokens_seen": 4571104, "step": 23980 }, { "epoch": 12.466216216216216, "grad_norm": 0.35257863998413086, "learning_rate": 1.867421624728143e-05, "loss": 0.0957, "num_input_tokens_seen": 4572096, "step": 23985 }, { "epoch": 12.46881496881497, "grad_norm": 0.08695608377456665, "learning_rate": 1.8663246700272783e-05, "loss": 0.1338, "num_input_tokens_seen": 4573088, "step": 23990 }, { "epoch": 12.471413721413722, "grad_norm": 0.1662154197692871, "learning_rate": 1.865227845688529e-05, "loss": 0.1479, "num_input_tokens_seen": 4574016, "step": 23995 }, { "epoch": 12.474012474012474, "grad_norm": 0.6175655126571655, "learning_rate": 1.8641311519375386e-05, "loss": 0.1425, "num_input_tokens_seen": 4574912, "step": 24000 }, { "epoch": 12.476611226611226, "grad_norm": 1.3395248651504517, "learning_rate": 1.8630345889999232e-05, "loss": 0.1595, "num_input_tokens_seen": 4575808, "step": 24005 }, { "epoch": 12.47920997920998, "grad_norm": 0.3827973008155823, "learning_rate": 1.8619381571012724e-05, "loss": 0.1361, "num_input_tokens_seen": 4576800, "step": 24010 }, { "epoch": 12.481808731808732, "grad_norm": 0.6444174647331238, "learning_rate": 1.8608418564671467e-05, "loss": 0.1382, "num_input_tokens_seen": 4577760, "step": 24015 }, { "epoch": 12.484407484407484, "grad_norm": 0.5165701508522034, "learning_rate": 1.859745687323083e-05, "loss": 0.2067, "num_input_tokens_seen": 4578720, "step": 24020 }, { "epoch": 12.487006237006238, "grad_norm": 0.5477184653282166, "learning_rate": 1.8586496498945877e-05, "loss": 0.1003, "num_input_tokens_seen": 4579712, "step": 24025 }, { "epoch": 12.48960498960499, "grad_norm": 1.0124552249908447, "learning_rate": 1.8575537444071432e-05, "loss": 0.1265, "num_input_tokens_seen": 4580704, "step": 24030 }, { "epoch": 12.492203742203742, "grad_norm": 0.4915532171726227, "learning_rate": 1.8564579710862028e-05, "loss": 0.1942, "num_input_tokens_seen": 4581600, "step": 24035 }, { "epoch": 12.494802494802494, "grad_norm": 0.28121912479400635, "learning_rate": 1.8553623301571936e-05, "loss": 0.0549, "num_input_tokens_seen": 4582528, "step": 24040 }, { "epoch": 12.497401247401248, "grad_norm": 0.513196587562561, "learning_rate": 1.8542668218455144e-05, "loss": 0.0695, "num_input_tokens_seen": 4583456, "step": 24045 }, { "epoch": 12.5, "grad_norm": 0.6657730937004089, "learning_rate": 1.8531714463765388e-05, "loss": 0.1356, "num_input_tokens_seen": 4584416, "step": 24050 }, { "epoch": 12.502598752598752, "grad_norm": 1.2714921236038208, "learning_rate": 1.852076203975609e-05, "loss": 0.1944, "num_input_tokens_seen": 4585344, "step": 24055 }, { "epoch": 12.505197505197506, "grad_norm": 1.0890374183654785, "learning_rate": 1.8509810948680453e-05, "loss": 0.2677, "num_input_tokens_seen": 4586272, "step": 24060 }, { "epoch": 12.507796257796258, "grad_norm": 1.8463268280029297, "learning_rate": 1.849886119279136e-05, "loss": 0.1828, "num_input_tokens_seen": 4587200, "step": 24065 }, { "epoch": 12.51039501039501, "grad_norm": 0.6403983235359192, "learning_rate": 1.8487912774341443e-05, "loss": 0.1089, "num_input_tokens_seen": 4588160, "step": 24070 }, { "epoch": 12.512993762993762, "grad_norm": 1.129788875579834, "learning_rate": 1.847696569558306e-05, "loss": 0.2635, "num_input_tokens_seen": 4589056, "step": 24075 }, { "epoch": 12.515592515592516, "grad_norm": 0.7224568128585815, "learning_rate": 1.8466019958768263e-05, "loss": 0.1726, "num_input_tokens_seen": 4590048, "step": 24080 }, { "epoch": 12.518191268191268, "grad_norm": 0.6579567193984985, "learning_rate": 1.845507556614887e-05, "loss": 0.1298, "num_input_tokens_seen": 4591040, "step": 24085 }, { "epoch": 12.52079002079002, "grad_norm": 0.8739511966705322, "learning_rate": 1.84441325199764e-05, "loss": 0.1635, "num_input_tokens_seen": 4591968, "step": 24090 }, { "epoch": 12.523388773388774, "grad_norm": 0.9053895473480225, "learning_rate": 1.8433190822502095e-05, "loss": 0.2349, "num_input_tokens_seen": 4592928, "step": 24095 }, { "epoch": 12.525987525987526, "grad_norm": 0.1385926455259323, "learning_rate": 1.8422250475976922e-05, "loss": 0.0965, "num_input_tokens_seen": 4593856, "step": 24100 }, { "epoch": 12.528586278586278, "grad_norm": 0.07047880440950394, "learning_rate": 1.841131148265158e-05, "loss": 0.1613, "num_input_tokens_seen": 4594912, "step": 24105 }, { "epoch": 12.53118503118503, "grad_norm": 0.5646631717681885, "learning_rate": 1.8400373844776465e-05, "loss": 0.1397, "num_input_tokens_seen": 4595936, "step": 24110 }, { "epoch": 12.533783783783784, "grad_norm": 0.24792888760566711, "learning_rate": 1.838943756460172e-05, "loss": 0.1153, "num_input_tokens_seen": 4596896, "step": 24115 }, { "epoch": 12.536382536382536, "grad_norm": 0.11088478565216064, "learning_rate": 1.8378502644377203e-05, "loss": 0.0334, "num_input_tokens_seen": 4597856, "step": 24120 }, { "epoch": 12.538981288981288, "grad_norm": 0.27548274397850037, "learning_rate": 1.8367569086352483e-05, "loss": 0.0997, "num_input_tokens_seen": 4598816, "step": 24125 }, { "epoch": 12.541580041580042, "grad_norm": 2.167968511581421, "learning_rate": 1.835663689277685e-05, "loss": 0.1753, "num_input_tokens_seen": 4599776, "step": 24130 }, { "epoch": 12.544178794178794, "grad_norm": 0.17587070167064667, "learning_rate": 1.8345706065899315e-05, "loss": 0.1219, "num_input_tokens_seen": 4600736, "step": 24135 }, { "epoch": 12.546777546777546, "grad_norm": 0.23641343414783478, "learning_rate": 1.8334776607968622e-05, "loss": 0.1844, "num_input_tokens_seen": 4601696, "step": 24140 }, { "epoch": 12.549376299376299, "grad_norm": 0.426349937915802, "learning_rate": 1.8323848521233217e-05, "loss": 0.1707, "num_input_tokens_seen": 4602624, "step": 24145 }, { "epoch": 12.551975051975052, "grad_norm": 0.7242111563682556, "learning_rate": 1.831292180794125e-05, "loss": 0.2491, "num_input_tokens_seen": 4603616, "step": 24150 }, { "epoch": 12.554573804573804, "grad_norm": 0.19286423921585083, "learning_rate": 1.8301996470340632e-05, "loss": 0.0767, "num_input_tokens_seen": 4604576, "step": 24155 }, { "epoch": 12.557172557172557, "grad_norm": 1.182722568511963, "learning_rate": 1.8291072510678947e-05, "loss": 0.1565, "num_input_tokens_seen": 4605504, "step": 24160 }, { "epoch": 12.55977130977131, "grad_norm": 1.2818324565887451, "learning_rate": 1.828014993120352e-05, "loss": 0.1885, "num_input_tokens_seen": 4606464, "step": 24165 }, { "epoch": 12.562370062370062, "grad_norm": 0.7500380873680115, "learning_rate": 1.8269228734161387e-05, "loss": 0.1274, "num_input_tokens_seen": 4607488, "step": 24170 }, { "epoch": 12.564968814968815, "grad_norm": 1.1575433015823364, "learning_rate": 1.8258308921799298e-05, "loss": 0.2657, "num_input_tokens_seen": 4608384, "step": 24175 }, { "epoch": 12.567567567567568, "grad_norm": 0.6125503182411194, "learning_rate": 1.8247390496363713e-05, "loss": 0.2349, "num_input_tokens_seen": 4609312, "step": 24180 }, { "epoch": 12.57016632016632, "grad_norm": 0.29885491728782654, "learning_rate": 1.8236473460100823e-05, "loss": 0.1062, "num_input_tokens_seen": 4610208, "step": 24185 }, { "epoch": 12.572765072765073, "grad_norm": 0.42977258563041687, "learning_rate": 1.8225557815256513e-05, "loss": 0.0715, "num_input_tokens_seen": 4611264, "step": 24190 }, { "epoch": 12.575363825363825, "grad_norm": 0.9180232286453247, "learning_rate": 1.82146435640764e-05, "loss": 0.1437, "num_input_tokens_seen": 4612224, "step": 24195 }, { "epoch": 12.577962577962579, "grad_norm": 0.7156871557235718, "learning_rate": 1.8203730708805795e-05, "loss": 0.1353, "num_input_tokens_seen": 4613184, "step": 24200 }, { "epoch": 12.58056133056133, "grad_norm": 0.4379197359085083, "learning_rate": 1.8192819251689743e-05, "loss": 0.2067, "num_input_tokens_seen": 4614144, "step": 24205 }, { "epoch": 12.583160083160083, "grad_norm": 0.14208079874515533, "learning_rate": 1.8181909194972992e-05, "loss": 0.1323, "num_input_tokens_seen": 4615136, "step": 24210 }, { "epoch": 12.585758835758837, "grad_norm": 0.5140554904937744, "learning_rate": 1.8171000540899986e-05, "loss": 0.0584, "num_input_tokens_seen": 4616032, "step": 24215 }, { "epoch": 12.588357588357589, "grad_norm": 0.8588868379592896, "learning_rate": 1.8160093291714905e-05, "loss": 0.1013, "num_input_tokens_seen": 4617056, "step": 24220 }, { "epoch": 12.59095634095634, "grad_norm": 0.14466729760169983, "learning_rate": 1.8149187449661626e-05, "loss": 0.0239, "num_input_tokens_seen": 4617920, "step": 24225 }, { "epoch": 12.593555093555093, "grad_norm": 0.27593937516212463, "learning_rate": 1.8138283016983748e-05, "loss": 0.1108, "num_input_tokens_seen": 4618944, "step": 24230 }, { "epoch": 12.596153846153847, "grad_norm": 0.40984857082366943, "learning_rate": 1.812737999592456e-05, "loss": 0.089, "num_input_tokens_seen": 4619840, "step": 24235 }, { "epoch": 12.598752598752599, "grad_norm": 0.6660204529762268, "learning_rate": 1.8116478388727084e-05, "loss": 0.0706, "num_input_tokens_seen": 4620800, "step": 24240 }, { "epoch": 12.60135135135135, "grad_norm": 1.2973908185958862, "learning_rate": 1.8105578197634042e-05, "loss": 0.1292, "num_input_tokens_seen": 4621728, "step": 24245 }, { "epoch": 12.603950103950105, "grad_norm": 1.1891745328903198, "learning_rate": 1.8094679424887854e-05, "loss": 0.2149, "num_input_tokens_seen": 4622656, "step": 24250 }, { "epoch": 12.606548856548857, "grad_norm": 1.2103939056396484, "learning_rate": 1.8083782072730666e-05, "loss": 0.1682, "num_input_tokens_seen": 4623680, "step": 24255 }, { "epoch": 12.609147609147609, "grad_norm": 1.4331259727478027, "learning_rate": 1.8072886143404317e-05, "loss": 0.0961, "num_input_tokens_seen": 4624640, "step": 24260 }, { "epoch": 12.611746361746361, "grad_norm": 0.20302070677280426, "learning_rate": 1.806199163915036e-05, "loss": 0.0432, "num_input_tokens_seen": 4625568, "step": 24265 }, { "epoch": 12.614345114345115, "grad_norm": 0.2532273232936859, "learning_rate": 1.8051098562210057e-05, "loss": 0.0766, "num_input_tokens_seen": 4626528, "step": 24270 }, { "epoch": 12.616943866943867, "grad_norm": 0.6358741521835327, "learning_rate": 1.804020691482438e-05, "loss": 0.0918, "num_input_tokens_seen": 4627456, "step": 24275 }, { "epoch": 12.619542619542619, "grad_norm": 0.10285945236682892, "learning_rate": 1.8029316699233983e-05, "loss": 0.0213, "num_input_tokens_seen": 4628384, "step": 24280 }, { "epoch": 12.622141372141373, "grad_norm": 0.10635688155889511, "learning_rate": 1.801842791767925e-05, "loss": 0.1096, "num_input_tokens_seen": 4629376, "step": 24285 }, { "epoch": 12.624740124740125, "grad_norm": 0.23810185492038727, "learning_rate": 1.800754057240027e-05, "loss": 0.1351, "num_input_tokens_seen": 4630304, "step": 24290 }, { "epoch": 12.627338877338877, "grad_norm": 1.0890116691589355, "learning_rate": 1.799665466563682e-05, "loss": 0.2517, "num_input_tokens_seen": 4631264, "step": 24295 }, { "epoch": 12.62993762993763, "grad_norm": 0.6968731880187988, "learning_rate": 1.7985770199628397e-05, "loss": 0.0777, "num_input_tokens_seen": 4632256, "step": 24300 }, { "epoch": 12.632536382536383, "grad_norm": 0.41070425510406494, "learning_rate": 1.7974887176614193e-05, "loss": 0.0623, "num_input_tokens_seen": 4633280, "step": 24305 }, { "epoch": 12.635135135135135, "grad_norm": 0.10410795360803604, "learning_rate": 1.796400559883311e-05, "loss": 0.0869, "num_input_tokens_seen": 4634176, "step": 24310 }, { "epoch": 12.637733887733887, "grad_norm": 0.7171242237091064, "learning_rate": 1.7953125468523736e-05, "loss": 0.0944, "num_input_tokens_seen": 4635104, "step": 24315 }, { "epoch": 12.640332640332641, "grad_norm": 1.1927279233932495, "learning_rate": 1.7942246787924384e-05, "loss": 0.2048, "num_input_tokens_seen": 4636128, "step": 24320 }, { "epoch": 12.642931392931393, "grad_norm": 0.9980327486991882, "learning_rate": 1.7931369559273047e-05, "loss": 0.1967, "num_input_tokens_seen": 4637024, "step": 24325 }, { "epoch": 12.645530145530145, "grad_norm": 0.9104196429252625, "learning_rate": 1.7920493784807444e-05, "loss": 0.1662, "num_input_tokens_seen": 4638016, "step": 24330 }, { "epoch": 12.648128898128899, "grad_norm": 1.4351710081100464, "learning_rate": 1.7909619466764964e-05, "loss": 0.1192, "num_input_tokens_seen": 4638912, "step": 24335 }, { "epoch": 12.650727650727651, "grad_norm": 1.9659945964813232, "learning_rate": 1.789874660738274e-05, "loss": 0.1801, "num_input_tokens_seen": 4639840, "step": 24340 }, { "epoch": 12.653326403326403, "grad_norm": 0.7598546743392944, "learning_rate": 1.7887875208897556e-05, "loss": 0.149, "num_input_tokens_seen": 4640736, "step": 24345 }, { "epoch": 12.655925155925155, "grad_norm": 1.869304895401001, "learning_rate": 1.7877005273545915e-05, "loss": 0.2017, "num_input_tokens_seen": 4641600, "step": 24350 }, { "epoch": 12.65852390852391, "grad_norm": 0.9812638759613037, "learning_rate": 1.7866136803564036e-05, "loss": 0.176, "num_input_tokens_seen": 4642656, "step": 24355 }, { "epoch": 12.661122661122661, "grad_norm": 0.7075117826461792, "learning_rate": 1.7855269801187814e-05, "loss": 0.1372, "num_input_tokens_seen": 4643552, "step": 24360 }, { "epoch": 12.663721413721413, "grad_norm": 0.05717301368713379, "learning_rate": 1.784440426865285e-05, "loss": 0.0741, "num_input_tokens_seen": 4644480, "step": 24365 }, { "epoch": 12.666320166320165, "grad_norm": 0.7746748328208923, "learning_rate": 1.783354020819445e-05, "loss": 0.0867, "num_input_tokens_seen": 4645408, "step": 24370 }, { "epoch": 12.66891891891892, "grad_norm": 0.09199653565883636, "learning_rate": 1.78226776220476e-05, "loss": 0.0559, "num_input_tokens_seen": 4646368, "step": 24375 }, { "epoch": 12.671517671517671, "grad_norm": 0.2736608386039734, "learning_rate": 1.7811816512447006e-05, "loss": 0.1539, "num_input_tokens_seen": 4647296, "step": 24380 }, { "epoch": 12.674116424116423, "grad_norm": 0.5065760016441345, "learning_rate": 1.7800956881627044e-05, "loss": 0.0781, "num_input_tokens_seen": 4648256, "step": 24385 }, { "epoch": 12.676715176715177, "grad_norm": 1.2491059303283691, "learning_rate": 1.779009873182181e-05, "loss": 0.1125, "num_input_tokens_seen": 4649248, "step": 24390 }, { "epoch": 12.67931392931393, "grad_norm": 0.38885655999183655, "learning_rate": 1.7779242065265073e-05, "loss": 0.0514, "num_input_tokens_seen": 4650208, "step": 24395 }, { "epoch": 12.681912681912682, "grad_norm": 0.14025981724262238, "learning_rate": 1.776838688419032e-05, "loss": 0.0694, "num_input_tokens_seen": 4651168, "step": 24400 }, { "epoch": 12.684511434511435, "grad_norm": 0.9142972230911255, "learning_rate": 1.7757533190830717e-05, "loss": 0.1254, "num_input_tokens_seen": 4652064, "step": 24405 }, { "epoch": 12.687110187110187, "grad_norm": 0.8751917481422424, "learning_rate": 1.7746680987419113e-05, "loss": 0.2003, "num_input_tokens_seen": 4652992, "step": 24410 }, { "epoch": 12.68970893970894, "grad_norm": 0.1832103580236435, "learning_rate": 1.7735830276188085e-05, "loss": 0.1665, "num_input_tokens_seen": 4654016, "step": 24415 }, { "epoch": 12.692307692307692, "grad_norm": 1.204201102256775, "learning_rate": 1.7724981059369866e-05, "loss": 0.1966, "num_input_tokens_seen": 4655040, "step": 24420 }, { "epoch": 12.694906444906445, "grad_norm": 0.540104866027832, "learning_rate": 1.7714133339196415e-05, "loss": 0.1751, "num_input_tokens_seen": 4656032, "step": 24425 }, { "epoch": 12.697505197505198, "grad_norm": 0.26496565341949463, "learning_rate": 1.770328711789935e-05, "loss": 0.0951, "num_input_tokens_seen": 4657024, "step": 24430 }, { "epoch": 12.70010395010395, "grad_norm": 0.622905969619751, "learning_rate": 1.7692442397710014e-05, "loss": 0.1229, "num_input_tokens_seen": 4658048, "step": 24435 }, { "epoch": 12.702702702702704, "grad_norm": 0.20612439513206482, "learning_rate": 1.7681599180859404e-05, "loss": 0.0628, "num_input_tokens_seen": 4659168, "step": 24440 }, { "epoch": 12.705301455301456, "grad_norm": 1.6069424152374268, "learning_rate": 1.767075746957825e-05, "loss": 0.1095, "num_input_tokens_seen": 4660128, "step": 24445 }, { "epoch": 12.707900207900208, "grad_norm": 1.0049830675125122, "learning_rate": 1.7659917266096935e-05, "loss": 0.1987, "num_input_tokens_seen": 4661024, "step": 24450 }, { "epoch": 12.71049896049896, "grad_norm": 0.5721873044967651, "learning_rate": 1.7649078572645555e-05, "loss": 0.0796, "num_input_tokens_seen": 4661984, "step": 24455 }, { "epoch": 12.713097713097714, "grad_norm": 0.8328849673271179, "learning_rate": 1.763824139145388e-05, "loss": 0.11, "num_input_tokens_seen": 4662976, "step": 24460 }, { "epoch": 12.715696465696466, "grad_norm": 0.8985901474952698, "learning_rate": 1.762740572475139e-05, "loss": 0.1038, "num_input_tokens_seen": 4663936, "step": 24465 }, { "epoch": 12.718295218295218, "grad_norm": 0.5370491147041321, "learning_rate": 1.761657157476723e-05, "loss": 0.1337, "num_input_tokens_seen": 4664800, "step": 24470 }, { "epoch": 12.720893970893972, "grad_norm": 0.9881789684295654, "learning_rate": 1.760573894373024e-05, "loss": 0.1292, "num_input_tokens_seen": 4665792, "step": 24475 }, { "epoch": 12.723492723492724, "grad_norm": 0.9664934873580933, "learning_rate": 1.7594907833868957e-05, "loss": 0.1882, "num_input_tokens_seen": 4666720, "step": 24480 }, { "epoch": 12.726091476091476, "grad_norm": 0.4549119770526886, "learning_rate": 1.7584078247411588e-05, "loss": 0.1368, "num_input_tokens_seen": 4667648, "step": 24485 }, { "epoch": 12.728690228690228, "grad_norm": 0.9168290495872498, "learning_rate": 1.7573250186586047e-05, "loss": 0.1139, "num_input_tokens_seen": 4668512, "step": 24490 }, { "epoch": 12.731288981288982, "grad_norm": 0.410892128944397, "learning_rate": 1.756242365361993e-05, "loss": 0.0422, "num_input_tokens_seen": 4669408, "step": 24495 }, { "epoch": 12.733887733887734, "grad_norm": 0.22889754176139832, "learning_rate": 1.7551598650740496e-05, "loss": 0.1413, "num_input_tokens_seen": 4670336, "step": 24500 }, { "epoch": 12.736486486486486, "grad_norm": 0.6494428515434265, "learning_rate": 1.7540775180174717e-05, "loss": 0.1223, "num_input_tokens_seen": 4671328, "step": 24505 }, { "epoch": 12.73908523908524, "grad_norm": 0.5659309029579163, "learning_rate": 1.752995324414924e-05, "loss": 0.0668, "num_input_tokens_seen": 4672352, "step": 24510 }, { "epoch": 12.741683991683992, "grad_norm": 0.23858313262462616, "learning_rate": 1.751913284489039e-05, "loss": 0.0949, "num_input_tokens_seen": 4673344, "step": 24515 }, { "epoch": 12.744282744282744, "grad_norm": 0.13925766944885254, "learning_rate": 1.7508313984624178e-05, "loss": 0.1264, "num_input_tokens_seen": 4674336, "step": 24520 }, { "epoch": 12.746881496881496, "grad_norm": 0.7719894051551819, "learning_rate": 1.7497496665576313e-05, "loss": 0.2656, "num_input_tokens_seen": 4675296, "step": 24525 }, { "epoch": 12.74948024948025, "grad_norm": 0.5382145047187805, "learning_rate": 1.7486680889972163e-05, "loss": 0.2528, "num_input_tokens_seen": 4676352, "step": 24530 }, { "epoch": 12.752079002079002, "grad_norm": 0.26441502571105957, "learning_rate": 1.7475866660036804e-05, "loss": 0.1707, "num_input_tokens_seen": 4677248, "step": 24535 }, { "epoch": 12.754677754677754, "grad_norm": 0.7724231481552124, "learning_rate": 1.7465053977994972e-05, "loss": 0.0914, "num_input_tokens_seen": 4678240, "step": 24540 }, { "epoch": 12.757276507276508, "grad_norm": 0.23500488698482513, "learning_rate": 1.7454242846071085e-05, "loss": 0.1143, "num_input_tokens_seen": 4679200, "step": 24545 }, { "epoch": 12.75987525987526, "grad_norm": 0.1656576544046402, "learning_rate": 1.7443433266489267e-05, "loss": 0.1489, "num_input_tokens_seen": 4680128, "step": 24550 }, { "epoch": 12.762474012474012, "grad_norm": 1.111168622970581, "learning_rate": 1.7432625241473293e-05, "loss": 0.1597, "num_input_tokens_seen": 4681152, "step": 24555 }, { "epoch": 12.765072765072766, "grad_norm": 0.6024618744850159, "learning_rate": 1.7421818773246642e-05, "loss": 0.2045, "num_input_tokens_seen": 4682048, "step": 24560 }, { "epoch": 12.767671517671518, "grad_norm": 0.14460578560829163, "learning_rate": 1.7411013864032453e-05, "loss": 0.0988, "num_input_tokens_seen": 4683008, "step": 24565 }, { "epoch": 12.77027027027027, "grad_norm": 0.9007580876350403, "learning_rate": 1.740021051605356e-05, "loss": 0.1138, "num_input_tokens_seen": 4683968, "step": 24570 }, { "epoch": 12.772869022869022, "grad_norm": 0.037211764603853226, "learning_rate": 1.7389408731532465e-05, "loss": 0.2343, "num_input_tokens_seen": 4684992, "step": 24575 }, { "epoch": 12.775467775467776, "grad_norm": 0.8680539131164551, "learning_rate": 1.737860851269136e-05, "loss": 0.0548, "num_input_tokens_seen": 4686016, "step": 24580 }, { "epoch": 12.778066528066528, "grad_norm": 0.4909765422344208, "learning_rate": 1.7367809861752094e-05, "loss": 0.0924, "num_input_tokens_seen": 4686976, "step": 24585 }, { "epoch": 12.78066528066528, "grad_norm": 0.6272005438804626, "learning_rate": 1.7357012780936218e-05, "loss": 0.1647, "num_input_tokens_seen": 4687936, "step": 24590 }, { "epoch": 12.783264033264032, "grad_norm": 0.7473691701889038, "learning_rate": 1.7346217272464942e-05, "loss": 0.079, "num_input_tokens_seen": 4688864, "step": 24595 }, { "epoch": 12.785862785862786, "grad_norm": 0.2867790460586548, "learning_rate": 1.7335423338559174e-05, "loss": 0.076, "num_input_tokens_seen": 4689888, "step": 24600 }, { "epoch": 12.788461538461538, "grad_norm": 0.21747107803821564, "learning_rate": 1.7324630981439467e-05, "loss": 0.0529, "num_input_tokens_seen": 4690816, "step": 24605 }, { "epoch": 12.79106029106029, "grad_norm": 0.8570411801338196, "learning_rate": 1.731384020332606e-05, "loss": 0.0735, "num_input_tokens_seen": 4691744, "step": 24610 }, { "epoch": 12.793659043659044, "grad_norm": 0.12855307757854462, "learning_rate": 1.730305100643889e-05, "loss": 0.1447, "num_input_tokens_seen": 4692672, "step": 24615 }, { "epoch": 12.796257796257796, "grad_norm": 0.4187091290950775, "learning_rate": 1.7292263392997547e-05, "loss": 0.1075, "num_input_tokens_seen": 4693664, "step": 24620 }, { "epoch": 12.798856548856548, "grad_norm": 0.3417571485042572, "learning_rate": 1.72814773652213e-05, "loss": 0.1448, "num_input_tokens_seen": 4694592, "step": 24625 }, { "epoch": 12.801455301455302, "grad_norm": 0.9246416091918945, "learning_rate": 1.727069292532909e-05, "loss": 0.2332, "num_input_tokens_seen": 4695552, "step": 24630 }, { "epoch": 12.804054054054054, "grad_norm": 0.1532185971736908, "learning_rate": 1.725991007553953e-05, "loss": 0.0889, "num_input_tokens_seen": 4696448, "step": 24635 }, { "epoch": 12.806652806652806, "grad_norm": 0.41856786608695984, "learning_rate": 1.7249128818070923e-05, "loss": 0.1025, "num_input_tokens_seen": 4697472, "step": 24640 }, { "epoch": 12.809251559251559, "grad_norm": 0.18541310727596283, "learning_rate": 1.7238349155141214e-05, "loss": 0.1356, "num_input_tokens_seen": 4698400, "step": 24645 }, { "epoch": 12.811850311850312, "grad_norm": 0.5486729145050049, "learning_rate": 1.7227571088968046e-05, "loss": 0.1909, "num_input_tokens_seen": 4699296, "step": 24650 }, { "epoch": 12.814449064449065, "grad_norm": 0.7878041863441467, "learning_rate": 1.721679462176872e-05, "loss": 0.171, "num_input_tokens_seen": 4700224, "step": 24655 }, { "epoch": 12.817047817047817, "grad_norm": 0.8602266311645508, "learning_rate": 1.7206019755760217e-05, "loss": 0.2033, "num_input_tokens_seen": 4701152, "step": 24660 }, { "epoch": 12.81964656964657, "grad_norm": 0.16755183041095734, "learning_rate": 1.719524649315919e-05, "loss": 0.1593, "num_input_tokens_seen": 4702112, "step": 24665 }, { "epoch": 12.822245322245323, "grad_norm": 0.1391330361366272, "learning_rate": 1.718447483618193e-05, "loss": 0.1026, "num_input_tokens_seen": 4703040, "step": 24670 }, { "epoch": 12.824844074844075, "grad_norm": 0.8350552916526794, "learning_rate": 1.7173704787044446e-05, "loss": 0.1923, "num_input_tokens_seen": 4704000, "step": 24675 }, { "epoch": 12.827442827442827, "grad_norm": 0.39034536480903625, "learning_rate": 1.7162936347962388e-05, "loss": 0.1165, "num_input_tokens_seen": 4704960, "step": 24680 }, { "epoch": 12.83004158004158, "grad_norm": 0.9935715794563293, "learning_rate": 1.7152169521151078e-05, "loss": 0.1281, "num_input_tokens_seen": 4705888, "step": 24685 }, { "epoch": 12.832640332640333, "grad_norm": 0.6079493761062622, "learning_rate": 1.714140430882551e-05, "loss": 0.1417, "num_input_tokens_seen": 4706912, "step": 24690 }, { "epoch": 12.835239085239085, "grad_norm": 0.13857893645763397, "learning_rate": 1.7130640713200353e-05, "loss": 0.0444, "num_input_tokens_seen": 4707808, "step": 24695 }, { "epoch": 12.837837837837839, "grad_norm": 0.505126953125, "learning_rate": 1.7119878736489918e-05, "loss": 0.2378, "num_input_tokens_seen": 4708768, "step": 24700 }, { "epoch": 12.84043659043659, "grad_norm": 1.2208069562911987, "learning_rate": 1.710911838090822e-05, "loss": 0.115, "num_input_tokens_seen": 4709728, "step": 24705 }, { "epoch": 12.843035343035343, "grad_norm": 0.5555828809738159, "learning_rate": 1.7098359648668903e-05, "loss": 0.1224, "num_input_tokens_seen": 4710720, "step": 24710 }, { "epoch": 12.845634095634095, "grad_norm": 0.2760359048843384, "learning_rate": 1.7087602541985308e-05, "loss": 0.0519, "num_input_tokens_seen": 4711680, "step": 24715 }, { "epoch": 12.848232848232849, "grad_norm": 0.22615566849708557, "learning_rate": 1.7076847063070416e-05, "loss": 0.0968, "num_input_tokens_seen": 4712576, "step": 24720 }, { "epoch": 12.8508316008316, "grad_norm": 1.1378798484802246, "learning_rate": 1.706609321413689e-05, "loss": 0.2736, "num_input_tokens_seen": 4713536, "step": 24725 }, { "epoch": 12.853430353430353, "grad_norm": 0.23964574933052063, "learning_rate": 1.7055340997397072e-05, "loss": 0.126, "num_input_tokens_seen": 4714560, "step": 24730 }, { "epoch": 12.856029106029107, "grad_norm": 0.35788601636886597, "learning_rate": 1.7044590415062918e-05, "loss": 0.1256, "num_input_tokens_seen": 4715520, "step": 24735 }, { "epoch": 12.858627858627859, "grad_norm": 0.6170722246170044, "learning_rate": 1.703384146934609e-05, "loss": 0.1762, "num_input_tokens_seen": 4716384, "step": 24740 }, { "epoch": 12.861226611226611, "grad_norm": 1.3016942739486694, "learning_rate": 1.702309416245791e-05, "loss": 0.197, "num_input_tokens_seen": 4717376, "step": 24745 }, { "epoch": 12.863825363825363, "grad_norm": 0.16767537593841553, "learning_rate": 1.701234849660935e-05, "loss": 0.0613, "num_input_tokens_seen": 4718368, "step": 24750 }, { "epoch": 12.866424116424117, "grad_norm": 0.6436236500740051, "learning_rate": 1.7001604474011053e-05, "loss": 0.1541, "num_input_tokens_seen": 4719328, "step": 24755 }, { "epoch": 12.869022869022869, "grad_norm": 0.7015265822410583, "learning_rate": 1.6990862096873304e-05, "loss": 0.1326, "num_input_tokens_seen": 4720224, "step": 24760 }, { "epoch": 12.871621621621621, "grad_norm": 0.2290751188993454, "learning_rate": 1.6980121367406092e-05, "loss": 0.0958, "num_input_tokens_seen": 4721120, "step": 24765 }, { "epoch": 12.874220374220375, "grad_norm": 0.6258598566055298, "learning_rate": 1.6969382287819025e-05, "loss": 0.0852, "num_input_tokens_seen": 4722080, "step": 24770 }, { "epoch": 12.876819126819127, "grad_norm": 0.264247328042984, "learning_rate": 1.6958644860321394e-05, "loss": 0.1387, "num_input_tokens_seen": 4723040, "step": 24775 }, { "epoch": 12.879417879417879, "grad_norm": 0.7198121547698975, "learning_rate": 1.694790908712213e-05, "loss": 0.1573, "num_input_tokens_seen": 4723968, "step": 24780 }, { "epoch": 12.882016632016633, "grad_norm": 0.1853436380624771, "learning_rate": 1.693717497042986e-05, "loss": 0.0462, "num_input_tokens_seen": 4724896, "step": 24785 }, { "epoch": 12.884615384615385, "grad_norm": 1.2238922119140625, "learning_rate": 1.6926442512452825e-05, "loss": 0.2151, "num_input_tokens_seen": 4725824, "step": 24790 }, { "epoch": 12.887214137214137, "grad_norm": 0.7401494383811951, "learning_rate": 1.691571171539897e-05, "loss": 0.1246, "num_input_tokens_seen": 4726688, "step": 24795 }, { "epoch": 12.88981288981289, "grad_norm": 0.1912757009267807, "learning_rate": 1.690498258147586e-05, "loss": 0.107, "num_input_tokens_seen": 4727616, "step": 24800 }, { "epoch": 12.892411642411643, "grad_norm": 0.5602843165397644, "learning_rate": 1.689425511289073e-05, "loss": 0.1429, "num_input_tokens_seen": 4728576, "step": 24805 }, { "epoch": 12.895010395010395, "grad_norm": 0.21412983536720276, "learning_rate": 1.688352931185049e-05, "loss": 0.0539, "num_input_tokens_seen": 4729536, "step": 24810 }, { "epoch": 12.897609147609147, "grad_norm": 0.7173550128936768, "learning_rate": 1.687280518056168e-05, "loss": 0.0976, "num_input_tokens_seen": 4730528, "step": 24815 }, { "epoch": 12.9002079002079, "grad_norm": 0.8620652556419373, "learning_rate": 1.686208272123052e-05, "loss": 0.095, "num_input_tokens_seen": 4731584, "step": 24820 }, { "epoch": 12.902806652806653, "grad_norm": 0.8905783295631409, "learning_rate": 1.6851361936062865e-05, "loss": 0.0849, "num_input_tokens_seen": 4732480, "step": 24825 }, { "epoch": 12.905405405405405, "grad_norm": 1.2473853826522827, "learning_rate": 1.6840642827264246e-05, "loss": 0.275, "num_input_tokens_seen": 4733408, "step": 24830 }, { "epoch": 12.908004158004157, "grad_norm": 0.8166947364807129, "learning_rate": 1.682992539703983e-05, "loss": 0.155, "num_input_tokens_seen": 4734432, "step": 24835 }, { "epoch": 12.910602910602911, "grad_norm": 0.6299273371696472, "learning_rate": 1.6819209647594457e-05, "loss": 0.1465, "num_input_tokens_seen": 4735392, "step": 24840 }, { "epoch": 12.913201663201663, "grad_norm": 0.22026236355304718, "learning_rate": 1.6808495581132605e-05, "loss": 0.0802, "num_input_tokens_seen": 4736320, "step": 24845 }, { "epoch": 12.915800415800415, "grad_norm": 1.2410614490509033, "learning_rate": 1.6797783199858414e-05, "loss": 0.1329, "num_input_tokens_seen": 4737280, "step": 24850 }, { "epoch": 12.91839916839917, "grad_norm": 0.20133677124977112, "learning_rate": 1.678707250597568e-05, "loss": 0.1072, "num_input_tokens_seen": 4738208, "step": 24855 }, { "epoch": 12.920997920997921, "grad_norm": 1.2814449071884155, "learning_rate": 1.6776363501687853e-05, "loss": 0.169, "num_input_tokens_seen": 4739168, "step": 24860 }, { "epoch": 12.923596673596673, "grad_norm": 0.27948105335235596, "learning_rate": 1.6765656189198013e-05, "loss": 0.0904, "num_input_tokens_seen": 4740032, "step": 24865 }, { "epoch": 12.926195426195425, "grad_norm": 0.37102246284484863, "learning_rate": 1.675495057070892e-05, "loss": 0.0696, "num_input_tokens_seen": 4740960, "step": 24870 }, { "epoch": 12.92879417879418, "grad_norm": 1.207871437072754, "learning_rate": 1.6744246648422973e-05, "loss": 0.0962, "num_input_tokens_seen": 4741888, "step": 24875 }, { "epoch": 12.931392931392931, "grad_norm": 0.8888381719589233, "learning_rate": 1.673354442454223e-05, "loss": 0.2398, "num_input_tokens_seen": 4742880, "step": 24880 }, { "epoch": 12.933991683991684, "grad_norm": 0.18933433294296265, "learning_rate": 1.672284390126838e-05, "loss": 0.0909, "num_input_tokens_seen": 4743968, "step": 24885 }, { "epoch": 12.936590436590437, "grad_norm": 1.0907542705535889, "learning_rate": 1.6712145080802795e-05, "loss": 0.1716, "num_input_tokens_seen": 4744928, "step": 24890 }, { "epoch": 12.93918918918919, "grad_norm": 0.12648101150989532, "learning_rate": 1.6701447965346456e-05, "loss": 0.1389, "num_input_tokens_seen": 4745792, "step": 24895 }, { "epoch": 12.941787941787942, "grad_norm": 0.28872328996658325, "learning_rate": 1.6690752557100032e-05, "loss": 0.0982, "num_input_tokens_seen": 4746752, "step": 24900 }, { "epoch": 12.944386694386694, "grad_norm": 0.14280560612678528, "learning_rate": 1.668005885826381e-05, "loss": 0.1366, "num_input_tokens_seen": 4747680, "step": 24905 }, { "epoch": 12.946985446985448, "grad_norm": 1.0995277166366577, "learning_rate": 1.6669366871037746e-05, "loss": 0.1774, "num_input_tokens_seen": 4748672, "step": 24910 }, { "epoch": 12.9495841995842, "grad_norm": 1.2879278659820557, "learning_rate": 1.6658676597621438e-05, "loss": 0.1166, "num_input_tokens_seen": 4749536, "step": 24915 }, { "epoch": 12.952182952182952, "grad_norm": 0.8846889138221741, "learning_rate": 1.6647988040214127e-05, "loss": 0.3127, "num_input_tokens_seen": 4750496, "step": 24920 }, { "epoch": 12.954781704781706, "grad_norm": 0.025853648781776428, "learning_rate": 1.6637301201014698e-05, "loss": 0.0852, "num_input_tokens_seen": 4751392, "step": 24925 }, { "epoch": 12.957380457380458, "grad_norm": 2.666607618331909, "learning_rate": 1.662661608222171e-05, "loss": 0.1366, "num_input_tokens_seen": 4752416, "step": 24930 }, { "epoch": 12.95997920997921, "grad_norm": 0.7444571256637573, "learning_rate": 1.6615932686033325e-05, "loss": 0.1032, "num_input_tokens_seen": 4753312, "step": 24935 }, { "epoch": 12.962577962577962, "grad_norm": 1.1135828495025635, "learning_rate": 1.6605251014647372e-05, "loss": 0.1225, "num_input_tokens_seen": 4754240, "step": 24940 }, { "epoch": 12.965176715176716, "grad_norm": 0.478823184967041, "learning_rate": 1.6594571070261338e-05, "loss": 0.0973, "num_input_tokens_seen": 4755232, "step": 24945 }, { "epoch": 12.967775467775468, "grad_norm": 0.1594957411289215, "learning_rate": 1.658389285507233e-05, "loss": 0.1364, "num_input_tokens_seen": 4756096, "step": 24950 }, { "epoch": 12.97037422037422, "grad_norm": 0.6268921494483948, "learning_rate": 1.6573216371277122e-05, "loss": 0.1099, "num_input_tokens_seen": 4757088, "step": 24955 }, { "epoch": 12.972972972972974, "grad_norm": 0.20740360021591187, "learning_rate": 1.6562541621072114e-05, "loss": 0.1389, "num_input_tokens_seen": 4757984, "step": 24960 }, { "epoch": 12.975571725571726, "grad_norm": 0.12246762961149216, "learning_rate": 1.655186860665336e-05, "loss": 0.0998, "num_input_tokens_seen": 4758944, "step": 24965 }, { "epoch": 12.978170478170478, "grad_norm": 0.15698693692684174, "learning_rate": 1.6541197330216552e-05, "loss": 0.0472, "num_input_tokens_seen": 4759904, "step": 24970 }, { "epoch": 12.98076923076923, "grad_norm": 0.06670878082513809, "learning_rate": 1.6530527793957023e-05, "loss": 0.1151, "num_input_tokens_seen": 4760896, "step": 24975 }, { "epoch": 12.983367983367984, "grad_norm": 1.2767577171325684, "learning_rate": 1.651986000006976e-05, "loss": 0.1187, "num_input_tokens_seen": 4761792, "step": 24980 }, { "epoch": 12.985966735966736, "grad_norm": 1.993733525276184, "learning_rate": 1.650919395074937e-05, "loss": 0.2494, "num_input_tokens_seen": 4762752, "step": 24985 }, { "epoch": 12.988565488565488, "grad_norm": 0.6280341148376465, "learning_rate": 1.6498529648190124e-05, "loss": 0.1119, "num_input_tokens_seen": 4763616, "step": 24990 }, { "epoch": 12.991164241164242, "grad_norm": 0.6023305058479309, "learning_rate": 1.6487867094585928e-05, "loss": 0.1919, "num_input_tokens_seen": 4764640, "step": 24995 }, { "epoch": 12.993762993762994, "grad_norm": 0.22085736691951752, "learning_rate": 1.6477206292130305e-05, "loss": 0.0911, "num_input_tokens_seen": 4765568, "step": 25000 }, { "epoch": 12.996361746361746, "grad_norm": 0.4638887941837311, "learning_rate": 1.6466547243016447e-05, "loss": 0.1401, "num_input_tokens_seen": 4766528, "step": 25005 }, { "epoch": 12.9989604989605, "grad_norm": 0.11123653501272202, "learning_rate": 1.645588994943717e-05, "loss": 0.1447, "num_input_tokens_seen": 4767456, "step": 25010 }, { "epoch": 13.0, "eval_loss": 0.14763543009757996, "eval_runtime": 8.0101, "eval_samples_per_second": 106.866, "eval_steps_per_second": 26.716, "num_input_tokens_seen": 4767736, "step": 25012 }, { "epoch": 13.001559251559252, "grad_norm": 0.835787296295166, "learning_rate": 1.6445234413584944e-05, "loss": 0.0792, "num_input_tokens_seen": 4768344, "step": 25015 }, { "epoch": 13.004158004158004, "grad_norm": 1.5029054880142212, "learning_rate": 1.6434580637651846e-05, "loss": 0.1826, "num_input_tokens_seen": 4769272, "step": 25020 }, { "epoch": 13.006756756756756, "grad_norm": 1.115305781364441, "learning_rate": 1.642392862382964e-05, "loss": 0.1885, "num_input_tokens_seen": 4770232, "step": 25025 }, { "epoch": 13.00935550935551, "grad_norm": 0.11509131640195847, "learning_rate": 1.641327837430967e-05, "loss": 0.0492, "num_input_tokens_seen": 4771192, "step": 25030 }, { "epoch": 13.011954261954262, "grad_norm": 0.4510165750980377, "learning_rate": 1.6402629891282965e-05, "loss": 0.275, "num_input_tokens_seen": 4772088, "step": 25035 }, { "epoch": 13.014553014553014, "grad_norm": 0.8761131167411804, "learning_rate": 1.6391983176940163e-05, "loss": 0.0552, "num_input_tokens_seen": 4772984, "step": 25040 }, { "epoch": 13.017151767151768, "grad_norm": 1.6291849613189697, "learning_rate": 1.638133823347156e-05, "loss": 0.0911, "num_input_tokens_seen": 4773976, "step": 25045 }, { "epoch": 13.01975051975052, "grad_norm": 0.3179493248462677, "learning_rate": 1.6370695063067053e-05, "loss": 0.086, "num_input_tokens_seen": 4774936, "step": 25050 }, { "epoch": 13.022349272349272, "grad_norm": 1.174198031425476, "learning_rate": 1.6360053667916217e-05, "loss": 0.3089, "num_input_tokens_seen": 4775960, "step": 25055 }, { "epoch": 13.024948024948024, "grad_norm": 0.17184069752693176, "learning_rate": 1.6349414050208235e-05, "loss": 0.2165, "num_input_tokens_seen": 4776920, "step": 25060 }, { "epoch": 13.027546777546778, "grad_norm": 1.4182469844818115, "learning_rate": 1.633877621213192e-05, "loss": 0.1248, "num_input_tokens_seen": 4777848, "step": 25065 }, { "epoch": 13.03014553014553, "grad_norm": 1.0440168380737305, "learning_rate": 1.6328140155875733e-05, "loss": 0.0793, "num_input_tokens_seen": 4778808, "step": 25070 }, { "epoch": 13.032744282744282, "grad_norm": 1.0485448837280273, "learning_rate": 1.6317505883627764e-05, "loss": 0.1878, "num_input_tokens_seen": 4779832, "step": 25075 }, { "epoch": 13.035343035343036, "grad_norm": 0.2997657060623169, "learning_rate": 1.630687339757574e-05, "loss": 0.0637, "num_input_tokens_seen": 4780792, "step": 25080 }, { "epoch": 13.037941787941788, "grad_norm": 0.06615975499153137, "learning_rate": 1.6296242699907017e-05, "loss": 0.1181, "num_input_tokens_seen": 4781720, "step": 25085 }, { "epoch": 13.04054054054054, "grad_norm": 0.5883814096450806, "learning_rate": 1.628561379280858e-05, "loss": 0.134, "num_input_tokens_seen": 4782680, "step": 25090 }, { "epoch": 13.043139293139292, "grad_norm": 0.11674127727746964, "learning_rate": 1.6274986678467046e-05, "loss": 0.0718, "num_input_tokens_seen": 4783640, "step": 25095 }, { "epoch": 13.045738045738046, "grad_norm": 0.2363821268081665, "learning_rate": 1.626436135906867e-05, "loss": 0.1049, "num_input_tokens_seen": 4784664, "step": 25100 }, { "epoch": 13.048336798336798, "grad_norm": 1.724947452545166, "learning_rate": 1.6253737836799337e-05, "loss": 0.1287, "num_input_tokens_seen": 4785592, "step": 25105 }, { "epoch": 13.05093555093555, "grad_norm": 0.20748457312583923, "learning_rate": 1.624311611384455e-05, "loss": 0.11, "num_input_tokens_seen": 4786584, "step": 25110 }, { "epoch": 13.053534303534304, "grad_norm": 0.26620572805404663, "learning_rate": 1.623249619238946e-05, "loss": 0.0762, "num_input_tokens_seen": 4787512, "step": 25115 }, { "epoch": 13.056133056133056, "grad_norm": 0.7623336911201477, "learning_rate": 1.6221878074618827e-05, "loss": 0.0906, "num_input_tokens_seen": 4788376, "step": 25120 }, { "epoch": 13.058731808731808, "grad_norm": 0.7858453989028931, "learning_rate": 1.6211261762717067e-05, "loss": 0.1318, "num_input_tokens_seen": 4789368, "step": 25125 }, { "epoch": 13.06133056133056, "grad_norm": 1.4192105531692505, "learning_rate": 1.620064725886819e-05, "loss": 0.1545, "num_input_tokens_seen": 4790296, "step": 25130 }, { "epoch": 13.063929313929314, "grad_norm": 1.0313656330108643, "learning_rate": 1.6190034565255856e-05, "loss": 0.1454, "num_input_tokens_seen": 4791288, "step": 25135 }, { "epoch": 13.066528066528067, "grad_norm": 0.20415665209293365, "learning_rate": 1.617942368406336e-05, "loss": 0.1819, "num_input_tokens_seen": 4792248, "step": 25140 }, { "epoch": 13.069126819126819, "grad_norm": 0.5728204250335693, "learning_rate": 1.61688146174736e-05, "loss": 0.0613, "num_input_tokens_seen": 4793176, "step": 25145 }, { "epoch": 13.071725571725572, "grad_norm": 0.9616571664810181, "learning_rate": 1.6158207367669125e-05, "loss": 0.0966, "num_input_tokens_seen": 4794104, "step": 25150 }, { "epoch": 13.074324324324325, "grad_norm": 0.2721901834011078, "learning_rate": 1.614760193683209e-05, "loss": 0.0704, "num_input_tokens_seen": 4795064, "step": 25155 }, { "epoch": 13.076923076923077, "grad_norm": 0.9670400023460388, "learning_rate": 1.6136998327144293e-05, "loss": 0.1559, "num_input_tokens_seen": 4795960, "step": 25160 }, { "epoch": 13.079521829521829, "grad_norm": 1.5324918031692505, "learning_rate": 1.6126396540787143e-05, "loss": 0.1263, "num_input_tokens_seen": 4796856, "step": 25165 }, { "epoch": 13.082120582120583, "grad_norm": 0.11163390427827835, "learning_rate": 1.611579657994168e-05, "loss": 0.0958, "num_input_tokens_seen": 4797784, "step": 25170 }, { "epoch": 13.084719334719335, "grad_norm": 0.4936670958995819, "learning_rate": 1.6105198446788572e-05, "loss": 0.1622, "num_input_tokens_seen": 4798680, "step": 25175 }, { "epoch": 13.087318087318087, "grad_norm": 0.9698746800422668, "learning_rate": 1.6094602143508112e-05, "loss": 0.1242, "num_input_tokens_seen": 4799672, "step": 25180 }, { "epoch": 13.08991683991684, "grad_norm": 0.3139577805995941, "learning_rate": 1.60840076722802e-05, "loss": 0.1904, "num_input_tokens_seen": 4800664, "step": 25185 }, { "epoch": 13.092515592515593, "grad_norm": 0.21524128317832947, "learning_rate": 1.6073415035284383e-05, "loss": 0.1181, "num_input_tokens_seen": 4801624, "step": 25190 }, { "epoch": 13.095114345114345, "grad_norm": 0.13288937509059906, "learning_rate": 1.6062824234699813e-05, "loss": 0.105, "num_input_tokens_seen": 4802552, "step": 25195 }, { "epoch": 13.097713097713097, "grad_norm": 0.1055823341012001, "learning_rate": 1.6052235272705267e-05, "loss": 0.0838, "num_input_tokens_seen": 4803544, "step": 25200 }, { "epoch": 13.10031185031185, "grad_norm": 1.4553463459014893, "learning_rate": 1.6041648151479154e-05, "loss": 0.2347, "num_input_tokens_seen": 4804536, "step": 25205 }, { "epoch": 13.102910602910603, "grad_norm": 0.17238245904445648, "learning_rate": 1.6031062873199488e-05, "loss": 0.1467, "num_input_tokens_seen": 4805496, "step": 25210 }, { "epoch": 13.105509355509355, "grad_norm": 0.4552723467350006, "learning_rate": 1.6020479440043922e-05, "loss": 0.1254, "num_input_tokens_seen": 4806488, "step": 25215 }, { "epoch": 13.108108108108109, "grad_norm": 0.36958619952201843, "learning_rate": 1.6009897854189714e-05, "loss": 0.1371, "num_input_tokens_seen": 4807448, "step": 25220 }, { "epoch": 13.11070686070686, "grad_norm": 0.7554356455802917, "learning_rate": 1.599931811781375e-05, "loss": 0.2196, "num_input_tokens_seen": 4808344, "step": 25225 }, { "epoch": 13.113305613305613, "grad_norm": 0.5466620922088623, "learning_rate": 1.598874023309254e-05, "loss": 0.0959, "num_input_tokens_seen": 4809240, "step": 25230 }, { "epoch": 13.115904365904365, "grad_norm": 0.42961984872817993, "learning_rate": 1.59781642022022e-05, "loss": 0.1914, "num_input_tokens_seen": 4810232, "step": 25235 }, { "epoch": 13.118503118503119, "grad_norm": 0.43749159574508667, "learning_rate": 1.5967590027318478e-05, "loss": 0.343, "num_input_tokens_seen": 4811256, "step": 25240 }, { "epoch": 13.121101871101871, "grad_norm": 0.578099250793457, "learning_rate": 1.5957017710616722e-05, "loss": 0.2652, "num_input_tokens_seen": 4812184, "step": 25245 }, { "epoch": 13.123700623700623, "grad_norm": 1.057769775390625, "learning_rate": 1.5946447254271924e-05, "loss": 0.1446, "num_input_tokens_seen": 4813176, "step": 25250 }, { "epoch": 13.126299376299377, "grad_norm": 1.169089913368225, "learning_rate": 1.5935878660458675e-05, "loss": 0.2292, "num_input_tokens_seen": 4814200, "step": 25255 }, { "epoch": 13.128898128898129, "grad_norm": 0.17813095450401306, "learning_rate": 1.5925311931351172e-05, "loss": 0.1027, "num_input_tokens_seen": 4815160, "step": 25260 }, { "epoch": 13.131496881496881, "grad_norm": 0.20779402554035187, "learning_rate": 1.5914747069123263e-05, "loss": 0.0554, "num_input_tokens_seen": 4816120, "step": 25265 }, { "epoch": 13.134095634095635, "grad_norm": 0.6614082455635071, "learning_rate": 1.5904184075948375e-05, "loss": 0.1577, "num_input_tokens_seen": 4817112, "step": 25270 }, { "epoch": 13.136694386694387, "grad_norm": 0.14094889163970947, "learning_rate": 1.589362295399958e-05, "loss": 0.0625, "num_input_tokens_seen": 4818040, "step": 25275 }, { "epoch": 13.13929313929314, "grad_norm": 0.4453447163105011, "learning_rate": 1.5883063705449546e-05, "loss": 0.1111, "num_input_tokens_seen": 4819032, "step": 25280 }, { "epoch": 13.141891891891891, "grad_norm": 0.2376653552055359, "learning_rate": 1.587250633247057e-05, "loss": 0.1091, "num_input_tokens_seen": 4819992, "step": 25285 }, { "epoch": 13.144490644490645, "grad_norm": 0.1800709217786789, "learning_rate": 1.586195083723454e-05, "loss": 0.1565, "num_input_tokens_seen": 4820984, "step": 25290 }, { "epoch": 13.147089397089397, "grad_norm": 0.6500412821769714, "learning_rate": 1.5851397221912993e-05, "loss": 0.0834, "num_input_tokens_seen": 4821912, "step": 25295 }, { "epoch": 13.14968814968815, "grad_norm": 0.13741780817508698, "learning_rate": 1.584084548867704e-05, "loss": 0.1126, "num_input_tokens_seen": 4822808, "step": 25300 }, { "epoch": 13.152286902286903, "grad_norm": 0.70419842004776, "learning_rate": 1.5830295639697435e-05, "loss": 0.1284, "num_input_tokens_seen": 4823672, "step": 25305 }, { "epoch": 13.154885654885655, "grad_norm": 1.3456217050552368, "learning_rate": 1.581974767714453e-05, "loss": 0.1264, "num_input_tokens_seen": 4824568, "step": 25310 }, { "epoch": 13.157484407484407, "grad_norm": 0.4863835275173187, "learning_rate": 1.5809201603188296e-05, "loss": 0.1107, "num_input_tokens_seen": 4825528, "step": 25315 }, { "epoch": 13.16008316008316, "grad_norm": 0.7879239320755005, "learning_rate": 1.5798657419998313e-05, "loss": 0.1106, "num_input_tokens_seen": 4826488, "step": 25320 }, { "epoch": 13.162681912681913, "grad_norm": 0.1351436972618103, "learning_rate": 1.5788115129743757e-05, "loss": 0.0735, "num_input_tokens_seen": 4827480, "step": 25325 }, { "epoch": 13.165280665280665, "grad_norm": 0.5707968473434448, "learning_rate": 1.5777574734593442e-05, "loss": 0.1147, "num_input_tokens_seen": 4828472, "step": 25330 }, { "epoch": 13.167879417879417, "grad_norm": 0.2574731111526489, "learning_rate": 1.576703623671577e-05, "loss": 0.1738, "num_input_tokens_seen": 4829432, "step": 25335 }, { "epoch": 13.170478170478171, "grad_norm": 0.8901452422142029, "learning_rate": 1.575649963827877e-05, "loss": 0.1854, "num_input_tokens_seen": 4830424, "step": 25340 }, { "epoch": 13.173076923076923, "grad_norm": 0.10566980391740799, "learning_rate": 1.5745964941450063e-05, "loss": 0.1446, "num_input_tokens_seen": 4831352, "step": 25345 }, { "epoch": 13.175675675675675, "grad_norm": 0.11524783819913864, "learning_rate": 1.573543214839689e-05, "loss": 0.0478, "num_input_tokens_seen": 4832376, "step": 25350 }, { "epoch": 13.178274428274428, "grad_norm": 1.4577298164367676, "learning_rate": 1.5724901261286105e-05, "loss": 0.0923, "num_input_tokens_seen": 4833304, "step": 25355 }, { "epoch": 13.180873180873181, "grad_norm": 0.38776126503944397, "learning_rate": 1.5714372282284148e-05, "loss": 0.1776, "num_input_tokens_seen": 4834264, "step": 25360 }, { "epoch": 13.183471933471933, "grad_norm": 0.7881196737289429, "learning_rate": 1.5703845213557094e-05, "loss": 0.0587, "num_input_tokens_seen": 4835224, "step": 25365 }, { "epoch": 13.186070686070686, "grad_norm": 0.6701270937919617, "learning_rate": 1.5693320057270604e-05, "loss": 0.1045, "num_input_tokens_seen": 4836248, "step": 25370 }, { "epoch": 13.18866943866944, "grad_norm": 0.5241432785987854, "learning_rate": 1.5682796815589964e-05, "loss": 0.097, "num_input_tokens_seen": 4837176, "step": 25375 }, { "epoch": 13.191268191268192, "grad_norm": 0.6867320537567139, "learning_rate": 1.5672275490680034e-05, "loss": 0.1945, "num_input_tokens_seen": 4838136, "step": 25380 }, { "epoch": 13.193866943866944, "grad_norm": 0.1486596018075943, "learning_rate": 1.566175608470533e-05, "loss": 0.1672, "num_input_tokens_seen": 4839096, "step": 25385 }, { "epoch": 13.196465696465696, "grad_norm": 0.30037909746170044, "learning_rate": 1.5651238599829926e-05, "loss": 0.0852, "num_input_tokens_seen": 4839992, "step": 25390 }, { "epoch": 13.19906444906445, "grad_norm": 1.6744303703308105, "learning_rate": 1.5640723038217514e-05, "loss": 0.1404, "num_input_tokens_seen": 4840920, "step": 25395 }, { "epoch": 13.201663201663202, "grad_norm": 0.14663632214069366, "learning_rate": 1.563020940203141e-05, "loss": 0.0682, "num_input_tokens_seen": 4841848, "step": 25400 }, { "epoch": 13.204261954261954, "grad_norm": 0.833950936794281, "learning_rate": 1.5619697693434514e-05, "loss": 0.1609, "num_input_tokens_seen": 4842808, "step": 25405 }, { "epoch": 13.206860706860708, "grad_norm": 0.7050344347953796, "learning_rate": 1.5609187914589335e-05, "loss": 0.3442, "num_input_tokens_seen": 4843736, "step": 25410 }, { "epoch": 13.20945945945946, "grad_norm": 0.19074653089046478, "learning_rate": 1.559868006765798e-05, "loss": 0.1451, "num_input_tokens_seen": 4844632, "step": 25415 }, { "epoch": 13.212058212058212, "grad_norm": 0.09332886338233948, "learning_rate": 1.5588174154802173e-05, "loss": 0.1048, "num_input_tokens_seen": 4845560, "step": 25420 }, { "epoch": 13.214656964656964, "grad_norm": 0.1048574149608612, "learning_rate": 1.5577670178183223e-05, "loss": 0.0654, "num_input_tokens_seen": 4846424, "step": 25425 }, { "epoch": 13.217255717255718, "grad_norm": 0.3770976960659027, "learning_rate": 1.5567168139962055e-05, "loss": 0.1255, "num_input_tokens_seen": 4847416, "step": 25430 }, { "epoch": 13.21985446985447, "grad_norm": 0.6777969002723694, "learning_rate": 1.5556668042299178e-05, "loss": 0.2045, "num_input_tokens_seen": 4848408, "step": 25435 }, { "epoch": 13.222453222453222, "grad_norm": 0.8616116046905518, "learning_rate": 1.5546169887354726e-05, "loss": 0.1536, "num_input_tokens_seen": 4849336, "step": 25440 }, { "epoch": 13.225051975051976, "grad_norm": 0.8044271469116211, "learning_rate": 1.5535673677288406e-05, "loss": 0.0755, "num_input_tokens_seen": 4850264, "step": 25445 }, { "epoch": 13.227650727650728, "grad_norm": 1.0716365575790405, "learning_rate": 1.5525179414259556e-05, "loss": 0.1289, "num_input_tokens_seen": 4851128, "step": 25450 }, { "epoch": 13.23024948024948, "grad_norm": 0.8981881737709045, "learning_rate": 1.5514687100427083e-05, "loss": 0.1677, "num_input_tokens_seen": 4852088, "step": 25455 }, { "epoch": 13.232848232848234, "grad_norm": 0.625151515007019, "learning_rate": 1.55041967379495e-05, "loss": 0.0669, "num_input_tokens_seen": 4852984, "step": 25460 }, { "epoch": 13.235446985446986, "grad_norm": 0.8199810981750488, "learning_rate": 1.549370832898494e-05, "loss": 0.0703, "num_input_tokens_seen": 4853912, "step": 25465 }, { "epoch": 13.238045738045738, "grad_norm": 0.5158668160438538, "learning_rate": 1.5483221875691103e-05, "loss": 0.2469, "num_input_tokens_seen": 4854872, "step": 25470 }, { "epoch": 13.24064449064449, "grad_norm": 0.05673262104392052, "learning_rate": 1.5472737380225314e-05, "loss": 0.1395, "num_input_tokens_seen": 4855864, "step": 25475 }, { "epoch": 13.243243243243244, "grad_norm": 0.28288406133651733, "learning_rate": 1.5462254844744485e-05, "loss": 0.1233, "num_input_tokens_seen": 4856728, "step": 25480 }, { "epoch": 13.245841995841996, "grad_norm": 0.5911988615989685, "learning_rate": 1.5451774271405113e-05, "loss": 0.0914, "num_input_tokens_seen": 4857624, "step": 25485 }, { "epoch": 13.248440748440748, "grad_norm": 0.6234331130981445, "learning_rate": 1.544129566236331e-05, "loss": 0.0339, "num_input_tokens_seen": 4858520, "step": 25490 }, { "epoch": 13.2510395010395, "grad_norm": 0.7737608551979065, "learning_rate": 1.5430819019774767e-05, "loss": 0.149, "num_input_tokens_seen": 4859512, "step": 25495 }, { "epoch": 13.253638253638254, "grad_norm": 0.9948777556419373, "learning_rate": 1.5420344345794793e-05, "loss": 0.1315, "num_input_tokens_seen": 4860408, "step": 25500 }, { "epoch": 13.256237006237006, "grad_norm": 0.6827519536018372, "learning_rate": 1.5409871642578266e-05, "loss": 0.1303, "num_input_tokens_seen": 4861304, "step": 25505 }, { "epoch": 13.258835758835758, "grad_norm": 0.7073098421096802, "learning_rate": 1.5399400912279675e-05, "loss": 0.1491, "num_input_tokens_seen": 4862296, "step": 25510 }, { "epoch": 13.261434511434512, "grad_norm": 0.17597362399101257, "learning_rate": 1.538893215705311e-05, "loss": 0.0734, "num_input_tokens_seen": 4863288, "step": 25515 }, { "epoch": 13.264033264033264, "grad_norm": 0.13728010654449463, "learning_rate": 1.5378465379052217e-05, "loss": 0.0596, "num_input_tokens_seen": 4864280, "step": 25520 }, { "epoch": 13.266632016632016, "grad_norm": 0.5947593450546265, "learning_rate": 1.5368000580430282e-05, "loss": 0.0671, "num_input_tokens_seen": 4865176, "step": 25525 }, { "epoch": 13.26923076923077, "grad_norm": 0.44807496666908264, "learning_rate": 1.5357537763340154e-05, "loss": 0.0591, "num_input_tokens_seen": 4866104, "step": 25530 }, { "epoch": 13.271829521829522, "grad_norm": 1.3551849126815796, "learning_rate": 1.5347076929934293e-05, "loss": 0.1768, "num_input_tokens_seen": 4867032, "step": 25535 }, { "epoch": 13.274428274428274, "grad_norm": 0.2225542515516281, "learning_rate": 1.533661808236473e-05, "loss": 0.0764, "num_input_tokens_seen": 4867992, "step": 25540 }, { "epoch": 13.277027027027026, "grad_norm": 0.9107880592346191, "learning_rate": 1.532616122278311e-05, "loss": 0.1736, "num_input_tokens_seen": 4869016, "step": 25545 }, { "epoch": 13.27962577962578, "grad_norm": 0.4444396197795868, "learning_rate": 1.5315706353340654e-05, "loss": 0.1777, "num_input_tokens_seen": 4869944, "step": 25550 }, { "epoch": 13.282224532224532, "grad_norm": 1.4667760133743286, "learning_rate": 1.5305253476188176e-05, "loss": 0.1525, "num_input_tokens_seen": 4870904, "step": 25555 }, { "epoch": 13.284823284823284, "grad_norm": 0.5397974848747253, "learning_rate": 1.529480259347608e-05, "loss": 0.0714, "num_input_tokens_seen": 4871864, "step": 25560 }, { "epoch": 13.287422037422038, "grad_norm": 1.2262601852416992, "learning_rate": 1.5284353707354372e-05, "loss": 0.0817, "num_input_tokens_seen": 4872824, "step": 25565 }, { "epoch": 13.29002079002079, "grad_norm": 0.6648799777030945, "learning_rate": 1.5273906819972622e-05, "loss": 0.1676, "num_input_tokens_seen": 4873848, "step": 25570 }, { "epoch": 13.292619542619542, "grad_norm": 1.2525477409362793, "learning_rate": 1.526346193348002e-05, "loss": 0.2293, "num_input_tokens_seen": 4874776, "step": 25575 }, { "epoch": 13.295218295218294, "grad_norm": 0.7006199955940247, "learning_rate": 1.5253019050025325e-05, "loss": 0.2093, "num_input_tokens_seen": 4875672, "step": 25580 }, { "epoch": 13.297817047817048, "grad_norm": 0.4609447121620178, "learning_rate": 1.5242578171756866e-05, "loss": 0.2097, "num_input_tokens_seen": 4876632, "step": 25585 }, { "epoch": 13.3004158004158, "grad_norm": 0.1961633712053299, "learning_rate": 1.5232139300822607e-05, "loss": 0.104, "num_input_tokens_seen": 4877624, "step": 25590 }, { "epoch": 13.303014553014552, "grad_norm": 0.22634242475032806, "learning_rate": 1.5221702439370056e-05, "loss": 0.1888, "num_input_tokens_seen": 4878488, "step": 25595 }, { "epoch": 13.305613305613306, "grad_norm": 1.1127395629882812, "learning_rate": 1.5211267589546327e-05, "loss": 0.1284, "num_input_tokens_seen": 4879480, "step": 25600 }, { "epoch": 13.308212058212058, "grad_norm": 0.6013505458831787, "learning_rate": 1.5200834753498128e-05, "loss": 0.1002, "num_input_tokens_seen": 4880440, "step": 25605 }, { "epoch": 13.31081081081081, "grad_norm": 0.2791110873222351, "learning_rate": 1.5190403933371728e-05, "loss": 0.1403, "num_input_tokens_seen": 4881368, "step": 25610 }, { "epoch": 13.313409563409563, "grad_norm": 0.5244098901748657, "learning_rate": 1.5179975131313004e-05, "loss": 0.15, "num_input_tokens_seen": 4882296, "step": 25615 }, { "epoch": 13.316008316008316, "grad_norm": 0.5248591303825378, "learning_rate": 1.5169548349467406e-05, "loss": 0.1008, "num_input_tokens_seen": 4883256, "step": 25620 }, { "epoch": 13.318607068607069, "grad_norm": 0.09447003155946732, "learning_rate": 1.515912358997997e-05, "loss": 0.2053, "num_input_tokens_seen": 4884248, "step": 25625 }, { "epoch": 13.32120582120582, "grad_norm": 0.6468590497970581, "learning_rate": 1.5148700854995317e-05, "loss": 0.1026, "num_input_tokens_seen": 4885336, "step": 25630 }, { "epoch": 13.323804573804575, "grad_norm": 0.9869291186332703, "learning_rate": 1.5138280146657664e-05, "loss": 0.1726, "num_input_tokens_seen": 4886456, "step": 25635 }, { "epoch": 13.326403326403327, "grad_norm": 0.4359591007232666, "learning_rate": 1.5127861467110782e-05, "loss": 0.143, "num_input_tokens_seen": 4887480, "step": 25640 }, { "epoch": 13.329002079002079, "grad_norm": 0.49847349524497986, "learning_rate": 1.5117444818498056e-05, "loss": 0.1083, "num_input_tokens_seen": 4888472, "step": 25645 }, { "epoch": 13.33160083160083, "grad_norm": 0.36622628569602966, "learning_rate": 1.510703020296243e-05, "loss": 0.0711, "num_input_tokens_seen": 4889400, "step": 25650 }, { "epoch": 13.334199584199585, "grad_norm": 1.0656728744506836, "learning_rate": 1.5096617622646434e-05, "loss": 0.2113, "num_input_tokens_seen": 4890392, "step": 25655 }, { "epoch": 13.336798336798337, "grad_norm": 0.33327406644821167, "learning_rate": 1.5086207079692194e-05, "loss": 0.1019, "num_input_tokens_seen": 4891320, "step": 25660 }, { "epoch": 13.339397089397089, "grad_norm": 0.9520492553710938, "learning_rate": 1.5075798576241401e-05, "loss": 0.0793, "num_input_tokens_seen": 4892344, "step": 25665 }, { "epoch": 13.341995841995843, "grad_norm": 0.7870372533798218, "learning_rate": 1.506539211443534e-05, "loss": 0.1341, "num_input_tokens_seen": 4893272, "step": 25670 }, { "epoch": 13.344594594594595, "grad_norm": 0.47458475828170776, "learning_rate": 1.5054987696414851e-05, "loss": 0.199, "num_input_tokens_seen": 4894232, "step": 25675 }, { "epoch": 13.347193347193347, "grad_norm": 0.057499635964632034, "learning_rate": 1.5044585324320393e-05, "loss": 0.0834, "num_input_tokens_seen": 4895096, "step": 25680 }, { "epoch": 13.3497920997921, "grad_norm": 0.7789453864097595, "learning_rate": 1.5034185000291967e-05, "loss": 0.0875, "num_input_tokens_seen": 4896120, "step": 25685 }, { "epoch": 13.352390852390853, "grad_norm": 0.6716204881668091, "learning_rate": 1.5023786726469171e-05, "loss": 0.1759, "num_input_tokens_seen": 4897144, "step": 25690 }, { "epoch": 13.354989604989605, "grad_norm": 0.22915643453598022, "learning_rate": 1.5013390504991175e-05, "loss": 0.0782, "num_input_tokens_seen": 4898072, "step": 25695 }, { "epoch": 13.357588357588357, "grad_norm": 2.725886106491089, "learning_rate": 1.5002996337996742e-05, "loss": 0.2058, "num_input_tokens_seen": 4898968, "step": 25700 }, { "epoch": 13.36018711018711, "grad_norm": 0.8585901856422424, "learning_rate": 1.4992604227624182e-05, "loss": 0.2102, "num_input_tokens_seen": 4899992, "step": 25705 }, { "epoch": 13.362785862785863, "grad_norm": 0.10598879307508469, "learning_rate": 1.4982214176011419e-05, "loss": 0.1257, "num_input_tokens_seen": 4900952, "step": 25710 }, { "epoch": 13.365384615384615, "grad_norm": 0.6523299813270569, "learning_rate": 1.4971826185295917e-05, "loss": 0.1303, "num_input_tokens_seen": 4901880, "step": 25715 }, { "epoch": 13.367983367983369, "grad_norm": 0.7922728061676025, "learning_rate": 1.4961440257614739e-05, "loss": 0.1061, "num_input_tokens_seen": 4902872, "step": 25720 }, { "epoch": 13.370582120582121, "grad_norm": 1.0019656419754028, "learning_rate": 1.4951056395104515e-05, "loss": 0.0574, "num_input_tokens_seen": 4903736, "step": 25725 }, { "epoch": 13.373180873180873, "grad_norm": 0.17745201289653778, "learning_rate": 1.494067459990146e-05, "loss": 0.1221, "num_input_tokens_seen": 4904664, "step": 25730 }, { "epoch": 13.375779625779625, "grad_norm": 0.898540198802948, "learning_rate": 1.4930294874141354e-05, "loss": 0.1361, "num_input_tokens_seen": 4905624, "step": 25735 }, { "epoch": 13.378378378378379, "grad_norm": 0.8913448452949524, "learning_rate": 1.4919917219959556e-05, "loss": 0.1264, "num_input_tokens_seen": 4906616, "step": 25740 }, { "epoch": 13.380977130977131, "grad_norm": 0.7897789478302002, "learning_rate": 1.4909541639490987e-05, "loss": 0.1695, "num_input_tokens_seen": 4907544, "step": 25745 }, { "epoch": 13.383575883575883, "grad_norm": 0.6351937651634216, "learning_rate": 1.4899168134870161e-05, "loss": 0.0764, "num_input_tokens_seen": 4908472, "step": 25750 }, { "epoch": 13.386174636174637, "grad_norm": 0.44941604137420654, "learning_rate": 1.488879670823115e-05, "loss": 0.1394, "num_input_tokens_seen": 4909368, "step": 25755 }, { "epoch": 13.388773388773389, "grad_norm": 0.7425285577774048, "learning_rate": 1.4878427361707614e-05, "loss": 0.0734, "num_input_tokens_seen": 4910328, "step": 25760 }, { "epoch": 13.391372141372141, "grad_norm": 0.2147183120250702, "learning_rate": 1.4868060097432756e-05, "loss": 0.0796, "num_input_tokens_seen": 4911224, "step": 25765 }, { "epoch": 13.393970893970893, "grad_norm": 0.07738599181175232, "learning_rate": 1.4857694917539389e-05, "loss": 0.0815, "num_input_tokens_seen": 4912088, "step": 25770 }, { "epoch": 13.396569646569647, "grad_norm": 0.3300643265247345, "learning_rate": 1.484733182415986e-05, "loss": 0.0568, "num_input_tokens_seen": 4913016, "step": 25775 }, { "epoch": 13.3991683991684, "grad_norm": 0.11486146599054337, "learning_rate": 1.4836970819426124e-05, "loss": 0.1559, "num_input_tokens_seen": 4913944, "step": 25780 }, { "epoch": 13.401767151767151, "grad_norm": 0.11144135892391205, "learning_rate": 1.482661190546967e-05, "loss": 0.1352, "num_input_tokens_seen": 4914968, "step": 25785 }, { "epoch": 13.404365904365905, "grad_norm": 0.15404890477657318, "learning_rate": 1.4816255084421574e-05, "loss": 0.1043, "num_input_tokens_seen": 4916024, "step": 25790 }, { "epoch": 13.406964656964657, "grad_norm": 0.1616349071264267, "learning_rate": 1.480590035841249e-05, "loss": 0.1419, "num_input_tokens_seen": 4916984, "step": 25795 }, { "epoch": 13.40956340956341, "grad_norm": 0.1145535409450531, "learning_rate": 1.4795547729572623e-05, "loss": 0.1501, "num_input_tokens_seen": 4917944, "step": 25800 }, { "epoch": 13.412162162162161, "grad_norm": 1.0328437089920044, "learning_rate": 1.4785197200031764e-05, "loss": 0.1443, "num_input_tokens_seen": 4919000, "step": 25805 }, { "epoch": 13.414760914760915, "grad_norm": 0.8530417084693909, "learning_rate": 1.4774848771919252e-05, "loss": 0.1507, "num_input_tokens_seen": 4919864, "step": 25810 }, { "epoch": 13.417359667359667, "grad_norm": 0.22958961129188538, "learning_rate": 1.476450244736402e-05, "loss": 0.1904, "num_input_tokens_seen": 4920856, "step": 25815 }, { "epoch": 13.41995841995842, "grad_norm": 2.425165891647339, "learning_rate": 1.4754158228494544e-05, "loss": 0.1592, "num_input_tokens_seen": 4921848, "step": 25820 }, { "epoch": 13.422557172557173, "grad_norm": 0.17501795291900635, "learning_rate": 1.474381611743888e-05, "loss": 0.2093, "num_input_tokens_seen": 4922840, "step": 25825 }, { "epoch": 13.425155925155925, "grad_norm": 0.2630009055137634, "learning_rate": 1.473347611632464e-05, "loss": 0.1188, "num_input_tokens_seen": 4923832, "step": 25830 }, { "epoch": 13.427754677754677, "grad_norm": 1.0153446197509766, "learning_rate": 1.4723138227279015e-05, "loss": 0.1913, "num_input_tokens_seen": 4924792, "step": 25835 }, { "epoch": 13.43035343035343, "grad_norm": 1.0730141401290894, "learning_rate": 1.4712802452428759e-05, "loss": 0.0929, "num_input_tokens_seen": 4925752, "step": 25840 }, { "epoch": 13.432952182952183, "grad_norm": 1.148997187614441, "learning_rate": 1.4702468793900188e-05, "loss": 0.1104, "num_input_tokens_seen": 4926712, "step": 25845 }, { "epoch": 13.435550935550935, "grad_norm": 1.6200605630874634, "learning_rate": 1.4692137253819166e-05, "loss": 0.1489, "num_input_tokens_seen": 4927672, "step": 25850 }, { "epoch": 13.438149688149688, "grad_norm": 0.09774915874004364, "learning_rate": 1.468180783431115e-05, "loss": 0.1655, "num_input_tokens_seen": 4928632, "step": 25855 }, { "epoch": 13.440748440748441, "grad_norm": 0.8472371101379395, "learning_rate": 1.4671480537501142e-05, "loss": 0.1342, "num_input_tokens_seen": 4929624, "step": 25860 }, { "epoch": 13.443347193347194, "grad_norm": 0.781446635723114, "learning_rate": 1.4661155365513717e-05, "loss": 0.1024, "num_input_tokens_seen": 4930520, "step": 25865 }, { "epoch": 13.445945945945946, "grad_norm": 0.2365603744983673, "learning_rate": 1.4650832320473008e-05, "loss": 0.0365, "num_input_tokens_seen": 4931480, "step": 25870 }, { "epoch": 13.448544698544698, "grad_norm": 0.9057862162590027, "learning_rate": 1.4640511404502713e-05, "loss": 0.2125, "num_input_tokens_seen": 4932472, "step": 25875 }, { "epoch": 13.451143451143452, "grad_norm": 1.0375392436981201, "learning_rate": 1.4630192619726083e-05, "loss": 0.1923, "num_input_tokens_seen": 4933432, "step": 25880 }, { "epoch": 13.453742203742204, "grad_norm": 0.1774883270263672, "learning_rate": 1.4619875968265951e-05, "loss": 0.0768, "num_input_tokens_seen": 4934392, "step": 25885 }, { "epoch": 13.456340956340956, "grad_norm": 0.5501959919929504, "learning_rate": 1.460956145224468e-05, "loss": 0.0802, "num_input_tokens_seen": 4935288, "step": 25890 }, { "epoch": 13.45893970893971, "grad_norm": 0.2763705253601074, "learning_rate": 1.4599249073784232e-05, "loss": 0.0448, "num_input_tokens_seen": 4936184, "step": 25895 }, { "epoch": 13.461538461538462, "grad_norm": 0.5899159908294678, "learning_rate": 1.4588938835006097e-05, "loss": 0.1035, "num_input_tokens_seen": 4937176, "step": 25900 }, { "epoch": 13.464137214137214, "grad_norm": 0.8488399982452393, "learning_rate": 1.4578630738031328e-05, "loss": 0.1662, "num_input_tokens_seen": 4938104, "step": 25905 }, { "epoch": 13.466735966735968, "grad_norm": 0.35384923219680786, "learning_rate": 1.456832478498058e-05, "loss": 0.1896, "num_input_tokens_seen": 4939064, "step": 25910 }, { "epoch": 13.46933471933472, "grad_norm": 0.6125838160514832, "learning_rate": 1.4558020977973998e-05, "loss": 0.1426, "num_input_tokens_seen": 4940088, "step": 25915 }, { "epoch": 13.471933471933472, "grad_norm": 0.2791254222393036, "learning_rate": 1.4547719319131325e-05, "loss": 0.0623, "num_input_tokens_seen": 4941016, "step": 25920 }, { "epoch": 13.474532224532224, "grad_norm": 0.7020288109779358, "learning_rate": 1.4537419810571873e-05, "loss": 0.073, "num_input_tokens_seen": 4941976, "step": 25925 }, { "epoch": 13.477130977130978, "grad_norm": 0.8107888698577881, "learning_rate": 1.4527122454414491e-05, "loss": 0.0991, "num_input_tokens_seen": 4942936, "step": 25930 }, { "epoch": 13.47972972972973, "grad_norm": 0.10741963982582092, "learning_rate": 1.4516827252777588e-05, "loss": 0.0726, "num_input_tokens_seen": 4943832, "step": 25935 }, { "epoch": 13.482328482328482, "grad_norm": 0.7835166454315186, "learning_rate": 1.450653420777912e-05, "loss": 0.1577, "num_input_tokens_seen": 4944824, "step": 25940 }, { "epoch": 13.484927234927236, "grad_norm": 1.9574006795883179, "learning_rate": 1.449624332153664e-05, "loss": 0.1361, "num_input_tokens_seen": 4945784, "step": 25945 }, { "epoch": 13.487525987525988, "grad_norm": 1.4166414737701416, "learning_rate": 1.4485954596167207e-05, "loss": 0.2356, "num_input_tokens_seen": 4946712, "step": 25950 }, { "epoch": 13.49012474012474, "grad_norm": 1.399687647819519, "learning_rate": 1.4475668033787465e-05, "loss": 0.1118, "num_input_tokens_seen": 4947640, "step": 25955 }, { "epoch": 13.492723492723492, "grad_norm": 0.08305925875902176, "learning_rate": 1.4465383636513597e-05, "loss": 0.1401, "num_input_tokens_seen": 4948568, "step": 25960 }, { "epoch": 13.495322245322246, "grad_norm": 0.41320961713790894, "learning_rate": 1.4455101406461358e-05, "loss": 0.2094, "num_input_tokens_seen": 4949496, "step": 25965 }, { "epoch": 13.497920997920998, "grad_norm": 0.25300997495651245, "learning_rate": 1.444482134574605e-05, "loss": 0.0743, "num_input_tokens_seen": 4950456, "step": 25970 }, { "epoch": 13.50051975051975, "grad_norm": 0.4646676182746887, "learning_rate": 1.443454345648252e-05, "loss": 0.102, "num_input_tokens_seen": 4951384, "step": 25975 }, { "epoch": 13.503118503118504, "grad_norm": 1.467594861984253, "learning_rate": 1.4424267740785171e-05, "loss": 0.187, "num_input_tokens_seen": 4952312, "step": 25980 }, { "epoch": 13.505717255717256, "grad_norm": 1.0535411834716797, "learning_rate": 1.4413994200767972e-05, "loss": 0.1712, "num_input_tokens_seen": 4953240, "step": 25985 }, { "epoch": 13.508316008316008, "grad_norm": 1.030263066291809, "learning_rate": 1.4403722838544418e-05, "loss": 0.202, "num_input_tokens_seen": 4954168, "step": 25990 }, { "epoch": 13.51091476091476, "grad_norm": 1.7014888525009155, "learning_rate": 1.4393453656227595e-05, "loss": 0.1467, "num_input_tokens_seen": 4955096, "step": 25995 }, { "epoch": 13.513513513513514, "grad_norm": 0.11649452894926071, "learning_rate": 1.4383186655930108e-05, "loss": 0.0415, "num_input_tokens_seen": 4956120, "step": 26000 }, { "epoch": 13.516112266112266, "grad_norm": 1.2938398122787476, "learning_rate": 1.4372921839764126e-05, "loss": 0.1173, "num_input_tokens_seen": 4957080, "step": 26005 }, { "epoch": 13.518711018711018, "grad_norm": 0.3383741080760956, "learning_rate": 1.436265920984135e-05, "loss": 0.0408, "num_input_tokens_seen": 4958008, "step": 26010 }, { "epoch": 13.521309771309772, "grad_norm": 0.746008574962616, "learning_rate": 1.4352398768273079e-05, "loss": 0.1796, "num_input_tokens_seen": 4959000, "step": 26015 }, { "epoch": 13.523908523908524, "grad_norm": 1.0994354486465454, "learning_rate": 1.4342140517170109e-05, "loss": 0.0945, "num_input_tokens_seen": 4959928, "step": 26020 }, { "epoch": 13.526507276507276, "grad_norm": 0.36614999175071716, "learning_rate": 1.433188445864281e-05, "loss": 0.1718, "num_input_tokens_seen": 4960920, "step": 26025 }, { "epoch": 13.529106029106028, "grad_norm": 0.12595953047275543, "learning_rate": 1.4321630594801092e-05, "loss": 0.0378, "num_input_tokens_seen": 4961784, "step": 26030 }, { "epoch": 13.531704781704782, "grad_norm": 0.5843955278396606, "learning_rate": 1.4311378927754435e-05, "loss": 0.176, "num_input_tokens_seen": 4962776, "step": 26035 }, { "epoch": 13.534303534303534, "grad_norm": 0.8764947652816772, "learning_rate": 1.4301129459611851e-05, "loss": 0.0649, "num_input_tokens_seen": 4963704, "step": 26040 }, { "epoch": 13.536902286902286, "grad_norm": 0.0958893671631813, "learning_rate": 1.4290882192481872e-05, "loss": 0.1041, "num_input_tokens_seen": 4964728, "step": 26045 }, { "epoch": 13.53950103950104, "grad_norm": 0.8484039902687073, "learning_rate": 1.4280637128472634e-05, "loss": 0.0909, "num_input_tokens_seen": 4965592, "step": 26050 }, { "epoch": 13.542099792099792, "grad_norm": 0.44185972213745117, "learning_rate": 1.4270394269691779e-05, "loss": 0.0762, "num_input_tokens_seen": 4966520, "step": 26055 }, { "epoch": 13.544698544698544, "grad_norm": 0.3886098265647888, "learning_rate": 1.4260153618246513e-05, "loss": 0.0709, "num_input_tokens_seen": 4967480, "step": 26060 }, { "epoch": 13.547297297297296, "grad_norm": 0.4038975238800049, "learning_rate": 1.4249915176243563e-05, "loss": 0.1191, "num_input_tokens_seen": 4968440, "step": 26065 }, { "epoch": 13.54989604989605, "grad_norm": 0.8368246555328369, "learning_rate": 1.4239678945789247e-05, "loss": 0.0445, "num_input_tokens_seen": 4969432, "step": 26070 }, { "epoch": 13.552494802494802, "grad_norm": 0.23571091890335083, "learning_rate": 1.4229444928989394e-05, "loss": 0.0771, "num_input_tokens_seen": 4970392, "step": 26075 }, { "epoch": 13.555093555093555, "grad_norm": 0.948326051235199, "learning_rate": 1.4219213127949377e-05, "loss": 0.1195, "num_input_tokens_seen": 4971320, "step": 26080 }, { "epoch": 13.557692307692308, "grad_norm": 1.1301252841949463, "learning_rate": 1.4208983544774113e-05, "loss": 0.2734, "num_input_tokens_seen": 4972248, "step": 26085 }, { "epoch": 13.56029106029106, "grad_norm": 0.17437449097633362, "learning_rate": 1.4198756181568099e-05, "loss": 0.1219, "num_input_tokens_seen": 4973208, "step": 26090 }, { "epoch": 13.562889812889813, "grad_norm": 0.8634201288223267, "learning_rate": 1.4188531040435326e-05, "loss": 0.1149, "num_input_tokens_seen": 4974232, "step": 26095 }, { "epoch": 13.565488565488565, "grad_norm": 0.6241894960403442, "learning_rate": 1.417830812347935e-05, "loss": 0.197, "num_input_tokens_seen": 4975224, "step": 26100 }, { "epoch": 13.568087318087318, "grad_norm": 1.198191523551941, "learning_rate": 1.4168087432803292e-05, "loss": 0.1069, "num_input_tokens_seen": 4976088, "step": 26105 }, { "epoch": 13.57068607068607, "grad_norm": 0.3899451196193695, "learning_rate": 1.4157868970509763e-05, "loss": 0.0471, "num_input_tokens_seen": 4977080, "step": 26110 }, { "epoch": 13.573284823284823, "grad_norm": 0.4581564962863922, "learning_rate": 1.4147652738700946e-05, "loss": 0.0745, "num_input_tokens_seen": 4978040, "step": 26115 }, { "epoch": 13.575883575883577, "grad_norm": 0.30808934569358826, "learning_rate": 1.413743873947858e-05, "loss": 0.1678, "num_input_tokens_seen": 4979000, "step": 26120 }, { "epoch": 13.578482328482329, "grad_norm": 0.3775588870048523, "learning_rate": 1.412722697494392e-05, "loss": 0.1556, "num_input_tokens_seen": 4980024, "step": 26125 }, { "epoch": 13.58108108108108, "grad_norm": 0.06976263970136642, "learning_rate": 1.411701744719777e-05, "loss": 0.1491, "num_input_tokens_seen": 4980984, "step": 26130 }, { "epoch": 13.583679833679835, "grad_norm": 1.1370174884796143, "learning_rate": 1.410681015834046e-05, "loss": 0.2231, "num_input_tokens_seen": 4981912, "step": 26135 }, { "epoch": 13.586278586278587, "grad_norm": 0.48414891958236694, "learning_rate": 1.4096605110471897e-05, "loss": 0.0889, "num_input_tokens_seen": 4982840, "step": 26140 }, { "epoch": 13.588877338877339, "grad_norm": 0.7873141765594482, "learning_rate": 1.408640230569149e-05, "loss": 0.1561, "num_input_tokens_seen": 4983832, "step": 26145 }, { "epoch": 13.59147609147609, "grad_norm": 1.2564536333084106, "learning_rate": 1.40762017460982e-05, "loss": 0.2592, "num_input_tokens_seen": 4984792, "step": 26150 }, { "epoch": 13.594074844074845, "grad_norm": 1.125820517539978, "learning_rate": 1.4066003433790514e-05, "loss": 0.1545, "num_input_tokens_seen": 4985720, "step": 26155 }, { "epoch": 13.596673596673597, "grad_norm": 0.11726906150579453, "learning_rate": 1.4055807370866485e-05, "loss": 0.1325, "num_input_tokens_seen": 4986712, "step": 26160 }, { "epoch": 13.599272349272349, "grad_norm": 0.6262134313583374, "learning_rate": 1.4045613559423685e-05, "loss": 0.0848, "num_input_tokens_seen": 4987672, "step": 26165 }, { "epoch": 13.601871101871101, "grad_norm": 1.4354641437530518, "learning_rate": 1.4035422001559218e-05, "loss": 0.1914, "num_input_tokens_seen": 4988600, "step": 26170 }, { "epoch": 13.604469854469855, "grad_norm": 0.36768975853919983, "learning_rate": 1.4025232699369727e-05, "loss": 0.2279, "num_input_tokens_seen": 4989560, "step": 26175 }, { "epoch": 13.607068607068607, "grad_norm": 0.3847762644290924, "learning_rate": 1.40150456549514e-05, "loss": 0.0797, "num_input_tokens_seen": 4990488, "step": 26180 }, { "epoch": 13.609667359667359, "grad_norm": 0.133498415350914, "learning_rate": 1.4004860870399955e-05, "loss": 0.1321, "num_input_tokens_seen": 4991512, "step": 26185 }, { "epoch": 13.612266112266113, "grad_norm": 0.05461416020989418, "learning_rate": 1.399467834781063e-05, "loss": 0.0592, "num_input_tokens_seen": 4992472, "step": 26190 }, { "epoch": 13.614864864864865, "grad_norm": 0.8100554943084717, "learning_rate": 1.3984498089278236e-05, "loss": 0.1242, "num_input_tokens_seen": 4993464, "step": 26195 }, { "epoch": 13.617463617463617, "grad_norm": 0.6047754287719727, "learning_rate": 1.3974320096897084e-05, "loss": 0.1864, "num_input_tokens_seen": 4994488, "step": 26200 }, { "epoch": 13.62006237006237, "grad_norm": 0.5698128342628479, "learning_rate": 1.3964144372761029e-05, "loss": 0.1667, "num_input_tokens_seen": 4995480, "step": 26205 }, { "epoch": 13.622661122661123, "grad_norm": 0.6463260054588318, "learning_rate": 1.3953970918963453e-05, "loss": 0.071, "num_input_tokens_seen": 4996408, "step": 26210 }, { "epoch": 13.625259875259875, "grad_norm": 0.10311494022607803, "learning_rate": 1.3943799737597294e-05, "loss": 0.1401, "num_input_tokens_seen": 4997240, "step": 26215 }, { "epoch": 13.627858627858627, "grad_norm": 0.42053496837615967, "learning_rate": 1.3933630830754999e-05, "loss": 0.1539, "num_input_tokens_seen": 4998168, "step": 26220 }, { "epoch": 13.630457380457381, "grad_norm": 0.40010151267051697, "learning_rate": 1.3923464200528541e-05, "loss": 0.0869, "num_input_tokens_seen": 4999128, "step": 26225 }, { "epoch": 13.633056133056133, "grad_norm": 0.47606995701789856, "learning_rate": 1.3913299849009465e-05, "loss": 0.0675, "num_input_tokens_seen": 5000024, "step": 26230 }, { "epoch": 13.635654885654885, "grad_norm": 1.3964438438415527, "learning_rate": 1.390313777828881e-05, "loss": 0.2306, "num_input_tokens_seen": 5000920, "step": 26235 }, { "epoch": 13.638253638253639, "grad_norm": 0.14342068135738373, "learning_rate": 1.3892977990457134e-05, "loss": 0.1193, "num_input_tokens_seen": 5001848, "step": 26240 }, { "epoch": 13.640852390852391, "grad_norm": 0.39160072803497314, "learning_rate": 1.3882820487604575e-05, "loss": 0.0843, "num_input_tokens_seen": 5002840, "step": 26245 }, { "epoch": 13.643451143451143, "grad_norm": 1.101550817489624, "learning_rate": 1.3872665271820767e-05, "loss": 0.1754, "num_input_tokens_seen": 5003896, "step": 26250 }, { "epoch": 13.646049896049895, "grad_norm": 0.5504142045974731, "learning_rate": 1.3862512345194872e-05, "loss": 0.0765, "num_input_tokens_seen": 5004856, "step": 26255 }, { "epoch": 13.64864864864865, "grad_norm": 0.5289038419723511, "learning_rate": 1.3852361709815586e-05, "loss": 0.3018, "num_input_tokens_seen": 5005816, "step": 26260 }, { "epoch": 13.651247401247401, "grad_norm": 0.18044933676719666, "learning_rate": 1.3842213367771157e-05, "loss": 0.1787, "num_input_tokens_seen": 5006744, "step": 26265 }, { "epoch": 13.653846153846153, "grad_norm": 1.1718096733093262, "learning_rate": 1.3832067321149326e-05, "loss": 0.2549, "num_input_tokens_seen": 5007672, "step": 26270 }, { "epoch": 13.656444906444907, "grad_norm": 0.12542998790740967, "learning_rate": 1.3821923572037377e-05, "loss": 0.0941, "num_input_tokens_seen": 5008568, "step": 26275 }, { "epoch": 13.65904365904366, "grad_norm": 2.1466662883758545, "learning_rate": 1.3811782122522113e-05, "loss": 0.178, "num_input_tokens_seen": 5009464, "step": 26280 }, { "epoch": 13.661642411642411, "grad_norm": 0.44805729389190674, "learning_rate": 1.3801642974689891e-05, "loss": 0.1733, "num_input_tokens_seen": 5010456, "step": 26285 }, { "epoch": 13.664241164241163, "grad_norm": 0.22245462238788605, "learning_rate": 1.3791506130626566e-05, "loss": 0.1892, "num_input_tokens_seen": 5011352, "step": 26290 }, { "epoch": 13.666839916839917, "grad_norm": 0.7574741840362549, "learning_rate": 1.3781371592417527e-05, "loss": 0.1443, "num_input_tokens_seen": 5012376, "step": 26295 }, { "epoch": 13.66943866943867, "grad_norm": 0.834897518157959, "learning_rate": 1.3771239362147693e-05, "loss": 0.0882, "num_input_tokens_seen": 5013336, "step": 26300 }, { "epoch": 13.672037422037421, "grad_norm": 0.5770482420921326, "learning_rate": 1.3761109441901504e-05, "loss": 0.1065, "num_input_tokens_seen": 5014264, "step": 26305 }, { "epoch": 13.674636174636175, "grad_norm": 0.33765625953674316, "learning_rate": 1.3750981833762922e-05, "loss": 0.1423, "num_input_tokens_seen": 5015256, "step": 26310 }, { "epoch": 13.677234927234927, "grad_norm": 0.9230561256408691, "learning_rate": 1.3740856539815433e-05, "loss": 0.0823, "num_input_tokens_seen": 5016184, "step": 26315 }, { "epoch": 13.67983367983368, "grad_norm": 0.15409880876541138, "learning_rate": 1.3730733562142068e-05, "loss": 0.1625, "num_input_tokens_seen": 5017176, "step": 26320 }, { "epoch": 13.682432432432432, "grad_norm": 0.5682739019393921, "learning_rate": 1.3720612902825352e-05, "loss": 0.1098, "num_input_tokens_seen": 5018136, "step": 26325 }, { "epoch": 13.685031185031185, "grad_norm": 1.0573668479919434, "learning_rate": 1.3710494563947357e-05, "loss": 0.0904, "num_input_tokens_seen": 5019096, "step": 26330 }, { "epoch": 13.687629937629938, "grad_norm": 0.27693143486976624, "learning_rate": 1.3700378547589643e-05, "loss": 0.1495, "num_input_tokens_seen": 5019992, "step": 26335 }, { "epoch": 13.69022869022869, "grad_norm": 0.5464162230491638, "learning_rate": 1.3690264855833343e-05, "loss": 0.1113, "num_input_tokens_seen": 5020920, "step": 26340 }, { "epoch": 13.692827442827443, "grad_norm": 1.676342248916626, "learning_rate": 1.3680153490759073e-05, "loss": 0.1561, "num_input_tokens_seen": 5021848, "step": 26345 }, { "epoch": 13.695426195426196, "grad_norm": 0.4058595895767212, "learning_rate": 1.3670044454446973e-05, "loss": 0.1011, "num_input_tokens_seen": 5022840, "step": 26350 }, { "epoch": 13.698024948024948, "grad_norm": 0.6942816972732544, "learning_rate": 1.3659937748976726e-05, "loss": 0.0974, "num_input_tokens_seen": 5023768, "step": 26355 }, { "epoch": 13.700623700623701, "grad_norm": 0.2917233109474182, "learning_rate": 1.3649833376427523e-05, "loss": 0.1709, "num_input_tokens_seen": 5024664, "step": 26360 }, { "epoch": 13.703222453222454, "grad_norm": 0.6275703310966492, "learning_rate": 1.3639731338878064e-05, "loss": 0.1694, "num_input_tokens_seen": 5025656, "step": 26365 }, { "epoch": 13.705821205821206, "grad_norm": 0.2644878625869751, "learning_rate": 1.3629631638406587e-05, "loss": 0.1202, "num_input_tokens_seen": 5026584, "step": 26370 }, { "epoch": 13.708419958419958, "grad_norm": 0.17181478440761566, "learning_rate": 1.3619534277090837e-05, "loss": 0.1723, "num_input_tokens_seen": 5027480, "step": 26375 }, { "epoch": 13.711018711018712, "grad_norm": 0.8883070349693298, "learning_rate": 1.3609439257008082e-05, "loss": 0.0743, "num_input_tokens_seen": 5028408, "step": 26380 }, { "epoch": 13.713617463617464, "grad_norm": 0.36074623465538025, "learning_rate": 1.35993465802351e-05, "loss": 0.1934, "num_input_tokens_seen": 5029304, "step": 26385 }, { "epoch": 13.716216216216216, "grad_norm": 1.494991660118103, "learning_rate": 1.3589256248848209e-05, "loss": 0.1789, "num_input_tokens_seen": 5030232, "step": 26390 }, { "epoch": 13.71881496881497, "grad_norm": 0.6532930135726929, "learning_rate": 1.3579168264923232e-05, "loss": 0.1773, "num_input_tokens_seen": 5031160, "step": 26395 }, { "epoch": 13.721413721413722, "grad_norm": 0.09257946163415909, "learning_rate": 1.3569082630535496e-05, "loss": 0.0812, "num_input_tokens_seen": 5032120, "step": 26400 }, { "epoch": 13.724012474012474, "grad_norm": 1.087863802909851, "learning_rate": 1.3558999347759855e-05, "loss": 0.1724, "num_input_tokens_seen": 5033048, "step": 26405 }, { "epoch": 13.726611226611226, "grad_norm": 0.7700700163841248, "learning_rate": 1.3548918418670692e-05, "loss": 0.1596, "num_input_tokens_seen": 5034040, "step": 26410 }, { "epoch": 13.72920997920998, "grad_norm": 0.3084530234336853, "learning_rate": 1.3538839845341892e-05, "loss": 0.078, "num_input_tokens_seen": 5035032, "step": 26415 }, { "epoch": 13.731808731808732, "grad_norm": 1.156084656715393, "learning_rate": 1.3528763629846854e-05, "loss": 0.2519, "num_input_tokens_seen": 5035960, "step": 26420 }, { "epoch": 13.734407484407484, "grad_norm": 0.22352737188339233, "learning_rate": 1.3518689774258487e-05, "loss": 0.0463, "num_input_tokens_seen": 5036920, "step": 26425 }, { "epoch": 13.737006237006238, "grad_norm": 0.7349374294281006, "learning_rate": 1.3508618280649255e-05, "loss": 0.0697, "num_input_tokens_seen": 5037848, "step": 26430 }, { "epoch": 13.73960498960499, "grad_norm": 1.4342743158340454, "learning_rate": 1.3498549151091066e-05, "loss": 0.1061, "num_input_tokens_seen": 5038808, "step": 26435 }, { "epoch": 13.742203742203742, "grad_norm": 0.5986635684967041, "learning_rate": 1.3488482387655393e-05, "loss": 0.1412, "num_input_tokens_seen": 5039736, "step": 26440 }, { "epoch": 13.744802494802494, "grad_norm": 0.3523922264575958, "learning_rate": 1.3478417992413216e-05, "loss": 0.0412, "num_input_tokens_seen": 5040632, "step": 26445 }, { "epoch": 13.747401247401248, "grad_norm": 1.4199559688568115, "learning_rate": 1.346835596743502e-05, "loss": 0.1719, "num_input_tokens_seen": 5041528, "step": 26450 }, { "epoch": 13.75, "grad_norm": 0.5706278681755066, "learning_rate": 1.3458296314790803e-05, "loss": 0.0874, "num_input_tokens_seen": 5042584, "step": 26455 }, { "epoch": 13.752598752598752, "grad_norm": 1.2084226608276367, "learning_rate": 1.3448239036550059e-05, "loss": 0.1318, "num_input_tokens_seen": 5043512, "step": 26460 }, { "epoch": 13.755197505197506, "grad_norm": 1.0145111083984375, "learning_rate": 1.3438184134781833e-05, "loss": 0.12, "num_input_tokens_seen": 5044536, "step": 26465 }, { "epoch": 13.757796257796258, "grad_norm": 0.26018157601356506, "learning_rate": 1.3428131611554651e-05, "loss": 0.0608, "num_input_tokens_seen": 5045400, "step": 26470 }, { "epoch": 13.76039501039501, "grad_norm": 0.9585947394371033, "learning_rate": 1.341808146893654e-05, "loss": 0.1123, "num_input_tokens_seen": 5046328, "step": 26475 }, { "epoch": 13.762993762993762, "grad_norm": 0.8732590079307556, "learning_rate": 1.3408033708995077e-05, "loss": 0.1413, "num_input_tokens_seen": 5047352, "step": 26480 }, { "epoch": 13.765592515592516, "grad_norm": 0.28785499930381775, "learning_rate": 1.339798833379732e-05, "loss": 0.0542, "num_input_tokens_seen": 5048376, "step": 26485 }, { "epoch": 13.768191268191268, "grad_norm": 0.7729218006134033, "learning_rate": 1.3387945345409835e-05, "loss": 0.2674, "num_input_tokens_seen": 5049336, "step": 26490 }, { "epoch": 13.77079002079002, "grad_norm": 0.8703333735466003, "learning_rate": 1.3377904745898711e-05, "loss": 0.1278, "num_input_tokens_seen": 5050360, "step": 26495 }, { "epoch": 13.773388773388774, "grad_norm": 0.7321739196777344, "learning_rate": 1.3367866537329533e-05, "loss": 0.1675, "num_input_tokens_seen": 5051384, "step": 26500 }, { "epoch": 13.775987525987526, "grad_norm": 0.24283501505851746, "learning_rate": 1.3357830721767401e-05, "loss": 0.1707, "num_input_tokens_seen": 5052280, "step": 26505 }, { "epoch": 13.778586278586278, "grad_norm": 0.3998665511608124, "learning_rate": 1.3347797301276915e-05, "loss": 0.0817, "num_input_tokens_seen": 5053208, "step": 26510 }, { "epoch": 13.78118503118503, "grad_norm": 1.2545802593231201, "learning_rate": 1.3337766277922203e-05, "loss": 0.1037, "num_input_tokens_seen": 5054168, "step": 26515 }, { "epoch": 13.783783783783784, "grad_norm": 0.13223294913768768, "learning_rate": 1.332773765376688e-05, "loss": 0.0656, "num_input_tokens_seen": 5055096, "step": 26520 }, { "epoch": 13.786382536382536, "grad_norm": 0.18724286556243896, "learning_rate": 1.3317711430874069e-05, "loss": 0.0947, "num_input_tokens_seen": 5056056, "step": 26525 }, { "epoch": 13.788981288981288, "grad_norm": 0.5939122438430786, "learning_rate": 1.3307687611306396e-05, "loss": 0.0999, "num_input_tokens_seen": 5056984, "step": 26530 }, { "epoch": 13.791580041580042, "grad_norm": 0.1486549973487854, "learning_rate": 1.3297666197126019e-05, "loss": 0.0366, "num_input_tokens_seen": 5057944, "step": 26535 }, { "epoch": 13.794178794178794, "grad_norm": 0.0927620455622673, "learning_rate": 1.3287647190394573e-05, "loss": 0.0253, "num_input_tokens_seen": 5058872, "step": 26540 }, { "epoch": 13.796777546777546, "grad_norm": 0.5823346376419067, "learning_rate": 1.3277630593173204e-05, "loss": 0.1046, "num_input_tokens_seen": 5059800, "step": 26545 }, { "epoch": 13.799376299376299, "grad_norm": 0.16994650661945343, "learning_rate": 1.3267616407522554e-05, "loss": 0.1696, "num_input_tokens_seen": 5060760, "step": 26550 }, { "epoch": 13.801975051975052, "grad_norm": 0.5213726758956909, "learning_rate": 1.3257604635502804e-05, "loss": 0.0809, "num_input_tokens_seen": 5061720, "step": 26555 }, { "epoch": 13.804573804573804, "grad_norm": 0.34625256061553955, "learning_rate": 1.324759527917361e-05, "loss": 0.1232, "num_input_tokens_seen": 5062712, "step": 26560 }, { "epoch": 13.807172557172557, "grad_norm": 1.2596864700317383, "learning_rate": 1.3237588340594109e-05, "loss": 0.1902, "num_input_tokens_seen": 5063640, "step": 26565 }, { "epoch": 13.80977130977131, "grad_norm": 2.596099853515625, "learning_rate": 1.3227583821822992e-05, "loss": 0.2776, "num_input_tokens_seen": 5064568, "step": 26570 }, { "epoch": 13.812370062370062, "grad_norm": 0.5212703347206116, "learning_rate": 1.3217581724918418e-05, "loss": 0.0511, "num_input_tokens_seen": 5065528, "step": 26575 }, { "epoch": 13.814968814968815, "grad_norm": 1.1110966205596924, "learning_rate": 1.320758205193805e-05, "loss": 0.0863, "num_input_tokens_seen": 5066456, "step": 26580 }, { "epoch": 13.817567567567568, "grad_norm": 1.2179880142211914, "learning_rate": 1.3197584804939076e-05, "loss": 0.2028, "num_input_tokens_seen": 5067384, "step": 26585 }, { "epoch": 13.82016632016632, "grad_norm": 1.2975850105285645, "learning_rate": 1.3187589985978157e-05, "loss": 0.1273, "num_input_tokens_seen": 5068248, "step": 26590 }, { "epoch": 13.822765072765073, "grad_norm": 0.1481197476387024, "learning_rate": 1.3177597597111469e-05, "loss": 0.1525, "num_input_tokens_seen": 5069208, "step": 26595 }, { "epoch": 13.825363825363825, "grad_norm": 0.9349840879440308, "learning_rate": 1.3167607640394674e-05, "loss": 0.0725, "num_input_tokens_seen": 5070232, "step": 26600 }, { "epoch": 13.827962577962579, "grad_norm": 0.5889196991920471, "learning_rate": 1.3157620117882963e-05, "loss": 0.0904, "num_input_tokens_seen": 5071288, "step": 26605 }, { "epoch": 13.83056133056133, "grad_norm": 0.27786344289779663, "learning_rate": 1.3147635031630995e-05, "loss": 0.0955, "num_input_tokens_seen": 5072280, "step": 26610 }, { "epoch": 13.833160083160083, "grad_norm": 1.302756667137146, "learning_rate": 1.3137652383692945e-05, "loss": 0.1483, "num_input_tokens_seen": 5073272, "step": 26615 }, { "epoch": 13.835758835758837, "grad_norm": 0.2928885817527771, "learning_rate": 1.3127672176122466e-05, "loss": 0.0862, "num_input_tokens_seen": 5074200, "step": 26620 }, { "epoch": 13.838357588357589, "grad_norm": 1.0967495441436768, "learning_rate": 1.3117694410972748e-05, "loss": 0.1855, "num_input_tokens_seen": 5075064, "step": 26625 }, { "epoch": 13.84095634095634, "grad_norm": 0.10137856006622314, "learning_rate": 1.3107719090296455e-05, "loss": 0.1243, "num_input_tokens_seen": 5075960, "step": 26630 }, { "epoch": 13.843555093555093, "grad_norm": 0.8339201211929321, "learning_rate": 1.3097746216145721e-05, "loss": 0.0894, "num_input_tokens_seen": 5076888, "step": 26635 }, { "epoch": 13.846153846153847, "grad_norm": 0.40684670209884644, "learning_rate": 1.3087775790572227e-05, "loss": 0.2292, "num_input_tokens_seen": 5077848, "step": 26640 }, { "epoch": 13.848752598752599, "grad_norm": 0.4124096632003784, "learning_rate": 1.3077807815627125e-05, "loss": 0.1594, "num_input_tokens_seen": 5078904, "step": 26645 }, { "epoch": 13.85135135135135, "grad_norm": 0.682343065738678, "learning_rate": 1.3067842293361054e-05, "loss": 0.1545, "num_input_tokens_seen": 5079864, "step": 26650 }, { "epoch": 13.853950103950105, "grad_norm": 0.41927585005760193, "learning_rate": 1.305787922582416e-05, "loss": 0.0577, "num_input_tokens_seen": 5080792, "step": 26655 }, { "epoch": 13.856548856548857, "grad_norm": 2.1157140731811523, "learning_rate": 1.3047918615066102e-05, "loss": 0.2748, "num_input_tokens_seen": 5081752, "step": 26660 }, { "epoch": 13.859147609147609, "grad_norm": 0.0876225084066391, "learning_rate": 1.3037960463136001e-05, "loss": 0.0341, "num_input_tokens_seen": 5082808, "step": 26665 }, { "epoch": 13.861746361746361, "grad_norm": 0.18239158391952515, "learning_rate": 1.3028004772082485e-05, "loss": 0.1353, "num_input_tokens_seen": 5083704, "step": 26670 }, { "epoch": 13.864345114345115, "grad_norm": 0.2357826977968216, "learning_rate": 1.3018051543953674e-05, "loss": 0.0693, "num_input_tokens_seen": 5084696, "step": 26675 }, { "epoch": 13.866943866943867, "grad_norm": 1.174798846244812, "learning_rate": 1.3008100780797197e-05, "loss": 0.0883, "num_input_tokens_seen": 5085688, "step": 26680 }, { "epoch": 13.869542619542619, "grad_norm": 0.3473937511444092, "learning_rate": 1.299815248466016e-05, "loss": 0.1651, "num_input_tokens_seen": 5086616, "step": 26685 }, { "epoch": 13.872141372141373, "grad_norm": 0.02666856162250042, "learning_rate": 1.2988206657589153e-05, "loss": 0.0356, "num_input_tokens_seen": 5087512, "step": 26690 }, { "epoch": 13.874740124740125, "grad_norm": 0.05667691305279732, "learning_rate": 1.2978263301630284e-05, "loss": 0.1561, "num_input_tokens_seen": 5088472, "step": 26695 }, { "epoch": 13.877338877338877, "grad_norm": 0.15237338840961456, "learning_rate": 1.2968322418829126e-05, "loss": 0.1193, "num_input_tokens_seen": 5089464, "step": 26700 }, { "epoch": 13.87993762993763, "grad_norm": 0.5171093940734863, "learning_rate": 1.2958384011230756e-05, "loss": 0.0842, "num_input_tokens_seen": 5090488, "step": 26705 }, { "epoch": 13.882536382536383, "grad_norm": 0.6222342848777771, "learning_rate": 1.2948448080879755e-05, "loss": 0.153, "num_input_tokens_seen": 5091608, "step": 26710 }, { "epoch": 13.885135135135135, "grad_norm": 0.3578874468803406, "learning_rate": 1.293851462982017e-05, "loss": 0.2348, "num_input_tokens_seen": 5092504, "step": 26715 }, { "epoch": 13.887733887733887, "grad_norm": 0.6376312375068665, "learning_rate": 1.292858366009555e-05, "loss": 0.1423, "num_input_tokens_seen": 5093432, "step": 26720 }, { "epoch": 13.890332640332641, "grad_norm": 0.8237974643707275, "learning_rate": 1.2918655173748925e-05, "loss": 0.1039, "num_input_tokens_seen": 5094392, "step": 26725 }, { "epoch": 13.892931392931393, "grad_norm": 0.883104145526886, "learning_rate": 1.290872917282284e-05, "loss": 0.1464, "num_input_tokens_seen": 5095320, "step": 26730 }, { "epoch": 13.895530145530145, "grad_norm": 1.0480889081954956, "learning_rate": 1.2898805659359297e-05, "loss": 0.2147, "num_input_tokens_seen": 5096280, "step": 26735 }, { "epoch": 13.898128898128899, "grad_norm": 0.398088663816452, "learning_rate": 1.2888884635399801e-05, "loss": 0.101, "num_input_tokens_seen": 5097240, "step": 26740 }, { "epoch": 13.900727650727651, "grad_norm": 0.7054910063743591, "learning_rate": 1.2878966102985332e-05, "loss": 0.0576, "num_input_tokens_seen": 5098200, "step": 26745 }, { "epoch": 13.903326403326403, "grad_norm": 0.3903743028640747, "learning_rate": 1.286905006415639e-05, "loss": 0.0558, "num_input_tokens_seen": 5099096, "step": 26750 }, { "epoch": 13.905925155925155, "grad_norm": 1.7019374370574951, "learning_rate": 1.285913652095293e-05, "loss": 0.2519, "num_input_tokens_seen": 5100120, "step": 26755 }, { "epoch": 13.90852390852391, "grad_norm": 0.6085997223854065, "learning_rate": 1.2849225475414407e-05, "loss": 0.0541, "num_input_tokens_seen": 5101016, "step": 26760 }, { "epoch": 13.911122661122661, "grad_norm": 0.3930060863494873, "learning_rate": 1.283931692957975e-05, "loss": 0.0531, "num_input_tokens_seen": 5101976, "step": 26765 }, { "epoch": 13.913721413721413, "grad_norm": 0.6639934182167053, "learning_rate": 1.2829410885487391e-05, "loss": 0.1639, "num_input_tokens_seen": 5102904, "step": 26770 }, { "epoch": 13.916320166320165, "grad_norm": 0.09997064620256424, "learning_rate": 1.281950734517524e-05, "loss": 0.1153, "num_input_tokens_seen": 5103832, "step": 26775 }, { "epoch": 13.91891891891892, "grad_norm": 1.4734857082366943, "learning_rate": 1.280960631068068e-05, "loss": 0.1322, "num_input_tokens_seen": 5104792, "step": 26780 }, { "epoch": 13.921517671517671, "grad_norm": 1.1655329465866089, "learning_rate": 1.2799707784040608e-05, "loss": 0.1903, "num_input_tokens_seen": 5105720, "step": 26785 }, { "epoch": 13.924116424116423, "grad_norm": 0.42828893661499023, "learning_rate": 1.2789811767291376e-05, "loss": 0.1562, "num_input_tokens_seen": 5106616, "step": 26790 }, { "epoch": 13.926715176715177, "grad_norm": 0.1125725582242012, "learning_rate": 1.2779918262468833e-05, "loss": 0.0434, "num_input_tokens_seen": 5107576, "step": 26795 }, { "epoch": 13.92931392931393, "grad_norm": 0.17357905209064484, "learning_rate": 1.2770027271608298e-05, "loss": 0.1256, "num_input_tokens_seen": 5108536, "step": 26800 }, { "epoch": 13.931912681912682, "grad_norm": 1.1097400188446045, "learning_rate": 1.27601387967446e-05, "loss": 0.1403, "num_input_tokens_seen": 5109496, "step": 26805 }, { "epoch": 13.934511434511435, "grad_norm": 0.8449452519416809, "learning_rate": 1.275025283991203e-05, "loss": 0.2656, "num_input_tokens_seen": 5110424, "step": 26810 }, { "epoch": 13.937110187110187, "grad_norm": 0.10148847848176956, "learning_rate": 1.2740369403144358e-05, "loss": 0.0456, "num_input_tokens_seen": 5111416, "step": 26815 }, { "epoch": 13.93970893970894, "grad_norm": 0.7747237682342529, "learning_rate": 1.2730488488474843e-05, "loss": 0.062, "num_input_tokens_seen": 5112280, "step": 26820 }, { "epoch": 13.942307692307692, "grad_norm": 0.046123404055833817, "learning_rate": 1.2720610097936242e-05, "loss": 0.0549, "num_input_tokens_seen": 5113240, "step": 26825 }, { "epoch": 13.944906444906445, "grad_norm": 0.16321703791618347, "learning_rate": 1.2710734233560744e-05, "loss": 0.0808, "num_input_tokens_seen": 5114232, "step": 26830 }, { "epoch": 13.947505197505198, "grad_norm": 0.3110121786594391, "learning_rate": 1.2700860897380079e-05, "loss": 0.1727, "num_input_tokens_seen": 5115192, "step": 26835 }, { "epoch": 13.95010395010395, "grad_norm": 0.22597935795783997, "learning_rate": 1.2690990091425415e-05, "loss": 0.0626, "num_input_tokens_seen": 5116152, "step": 26840 }, { "epoch": 13.952702702702704, "grad_norm": 0.9044389724731445, "learning_rate": 1.268112181772741e-05, "loss": 0.1894, "num_input_tokens_seen": 5117144, "step": 26845 }, { "epoch": 13.955301455301456, "grad_norm": 1.1421400308609009, "learning_rate": 1.2671256078316197e-05, "loss": 0.098, "num_input_tokens_seen": 5118104, "step": 26850 }, { "epoch": 13.957900207900208, "grad_norm": 1.3031715154647827, "learning_rate": 1.266139287522141e-05, "loss": 0.2381, "num_input_tokens_seen": 5119160, "step": 26855 }, { "epoch": 13.96049896049896, "grad_norm": 0.10158317536115646, "learning_rate": 1.265153221047214e-05, "loss": 0.1206, "num_input_tokens_seen": 5120152, "step": 26860 }, { "epoch": 13.963097713097714, "grad_norm": 1.0891022682189941, "learning_rate": 1.2641674086096955e-05, "loss": 0.1541, "num_input_tokens_seen": 5121080, "step": 26865 }, { "epoch": 13.965696465696466, "grad_norm": 1.4411927461624146, "learning_rate": 1.2631818504123902e-05, "loss": 0.1904, "num_input_tokens_seen": 5122040, "step": 26870 }, { "epoch": 13.968295218295218, "grad_norm": 0.6669436693191528, "learning_rate": 1.2621965466580525e-05, "loss": 0.157, "num_input_tokens_seen": 5123032, "step": 26875 }, { "epoch": 13.970893970893972, "grad_norm": 1.3136314153671265, "learning_rate": 1.2612114975493819e-05, "loss": 0.1642, "num_input_tokens_seen": 5123992, "step": 26880 }, { "epoch": 13.973492723492724, "grad_norm": 0.08811653405427933, "learning_rate": 1.2602267032890267e-05, "loss": 0.132, "num_input_tokens_seen": 5124952, "step": 26885 }, { "epoch": 13.976091476091476, "grad_norm": 0.13707689940929413, "learning_rate": 1.2592421640795821e-05, "loss": 0.0697, "num_input_tokens_seen": 5125912, "step": 26890 }, { "epoch": 13.978690228690228, "grad_norm": 0.1302364021539688, "learning_rate": 1.2582578801235917e-05, "loss": 0.2207, "num_input_tokens_seen": 5126808, "step": 26895 }, { "epoch": 13.981288981288982, "grad_norm": 1.0653352737426758, "learning_rate": 1.2572738516235461e-05, "loss": 0.0839, "num_input_tokens_seen": 5127768, "step": 26900 }, { "epoch": 13.983887733887734, "grad_norm": 0.3309744596481323, "learning_rate": 1.2562900787818827e-05, "loss": 0.2658, "num_input_tokens_seen": 5128696, "step": 26905 }, { "epoch": 13.986486486486486, "grad_norm": 0.09119761735200882, "learning_rate": 1.2553065618009883e-05, "loss": 0.2143, "num_input_tokens_seen": 5129592, "step": 26910 }, { "epoch": 13.98908523908524, "grad_norm": 0.15872704982757568, "learning_rate": 1.254323300883195e-05, "loss": 0.1372, "num_input_tokens_seen": 5130488, "step": 26915 }, { "epoch": 13.991683991683992, "grad_norm": 1.0916999578475952, "learning_rate": 1.2533402962307833e-05, "loss": 0.1577, "num_input_tokens_seen": 5131352, "step": 26920 }, { "epoch": 13.994282744282744, "grad_norm": 0.18473665416240692, "learning_rate": 1.2523575480459798e-05, "loss": 0.1237, "num_input_tokens_seen": 5132280, "step": 26925 }, { "epoch": 13.996881496881496, "grad_norm": 0.943466067314148, "learning_rate": 1.2513750565309606e-05, "loss": 0.1519, "num_input_tokens_seen": 5133240, "step": 26930 }, { "epoch": 13.99948024948025, "grad_norm": 1.2858624458312988, "learning_rate": 1.250392821887847e-05, "loss": 0.149, "num_input_tokens_seen": 5134200, "step": 26935 }, { "epoch": 14.0, "eval_loss": 0.1474549025297165, "eval_runtime": 8.0196, "eval_samples_per_second": 106.738, "eval_steps_per_second": 26.684, "num_input_tokens_seen": 5134344, "step": 26936 }, { "epoch": 14.002079002079002, "grad_norm": 0.36838486790657043, "learning_rate": 1.2494108443187072e-05, "loss": 0.0667, "num_input_tokens_seen": 5135144, "step": 26940 }, { "epoch": 14.004677754677754, "grad_norm": 0.1687196046113968, "learning_rate": 1.2484291240255592e-05, "loss": 0.0582, "num_input_tokens_seen": 5136104, "step": 26945 }, { "epoch": 14.007276507276508, "grad_norm": 0.08754031360149384, "learning_rate": 1.247447661210365e-05, "loss": 0.0537, "num_input_tokens_seen": 5137064, "step": 26950 }, { "epoch": 14.00987525987526, "grad_norm": 0.4941779375076294, "learning_rate": 1.2464664560750356e-05, "loss": 0.0493, "num_input_tokens_seen": 5137992, "step": 26955 }, { "epoch": 14.012474012474012, "grad_norm": 0.07350978255271912, "learning_rate": 1.2454855088214274e-05, "loss": 0.1186, "num_input_tokens_seen": 5138952, "step": 26960 }, { "epoch": 14.015072765072764, "grad_norm": 0.08750802278518677, "learning_rate": 1.2445048196513453e-05, "loss": 0.0537, "num_input_tokens_seen": 5139848, "step": 26965 }, { "epoch": 14.017671517671518, "grad_norm": 2.580265998840332, "learning_rate": 1.24352438876654e-05, "loss": 0.226, "num_input_tokens_seen": 5140712, "step": 26970 }, { "epoch": 14.02027027027027, "grad_norm": 0.18227969110012054, "learning_rate": 1.2425442163687092e-05, "loss": 0.0609, "num_input_tokens_seen": 5141640, "step": 26975 }, { "epoch": 14.022869022869022, "grad_norm": 0.3533226251602173, "learning_rate": 1.241564302659499e-05, "loss": 0.0891, "num_input_tokens_seen": 5142632, "step": 26980 }, { "epoch": 14.025467775467776, "grad_norm": 0.871558666229248, "learning_rate": 1.2405846478405001e-05, "loss": 0.1834, "num_input_tokens_seen": 5143656, "step": 26985 }, { "epoch": 14.028066528066528, "grad_norm": 0.6610402464866638, "learning_rate": 1.2396052521132511e-05, "loss": 0.0862, "num_input_tokens_seen": 5144616, "step": 26990 }, { "epoch": 14.03066528066528, "grad_norm": 0.06273629516363144, "learning_rate": 1.2386261156792358e-05, "loss": 0.1106, "num_input_tokens_seen": 5145608, "step": 26995 }, { "epoch": 14.033264033264032, "grad_norm": 1.2379142045974731, "learning_rate": 1.2376472387398883e-05, "loss": 0.2137, "num_input_tokens_seen": 5146504, "step": 27000 }, { "epoch": 14.035862785862786, "grad_norm": 0.9453418850898743, "learning_rate": 1.2366686214965854e-05, "loss": 0.1917, "num_input_tokens_seen": 5147464, "step": 27005 }, { "epoch": 14.038461538461538, "grad_norm": 0.3853246569633484, "learning_rate": 1.235690264150652e-05, "loss": 0.0998, "num_input_tokens_seen": 5148328, "step": 27010 }, { "epoch": 14.04106029106029, "grad_norm": 1.3894047737121582, "learning_rate": 1.234712166903359e-05, "loss": 0.331, "num_input_tokens_seen": 5149320, "step": 27015 }, { "epoch": 14.043659043659044, "grad_norm": 1.3312007188796997, "learning_rate": 1.233734329955927e-05, "loss": 0.1062, "num_input_tokens_seen": 5150216, "step": 27020 }, { "epoch": 14.046257796257796, "grad_norm": 0.20273970067501068, "learning_rate": 1.2327567535095172e-05, "loss": 0.219, "num_input_tokens_seen": 5151176, "step": 27025 }, { "epoch": 14.048856548856548, "grad_norm": 1.5893827676773071, "learning_rate": 1.2317794377652409e-05, "loss": 0.2192, "num_input_tokens_seen": 5152104, "step": 27030 }, { "epoch": 14.051455301455302, "grad_norm": 0.8829306364059448, "learning_rate": 1.2308023829241567e-05, "loss": 0.083, "num_input_tokens_seen": 5153000, "step": 27035 }, { "epoch": 14.054054054054054, "grad_norm": 0.8390406370162964, "learning_rate": 1.2298255891872673e-05, "loss": 0.1165, "num_input_tokens_seen": 5153960, "step": 27040 }, { "epoch": 14.056652806652806, "grad_norm": 0.6660067439079285, "learning_rate": 1.2288490567555225e-05, "loss": 0.0869, "num_input_tokens_seen": 5154952, "step": 27045 }, { "epoch": 14.059251559251559, "grad_norm": 1.2499350309371948, "learning_rate": 1.2278727858298172e-05, "loss": 0.1526, "num_input_tokens_seen": 5155912, "step": 27050 }, { "epoch": 14.061850311850312, "grad_norm": 1.2027498483657837, "learning_rate": 1.2268967766109958e-05, "loss": 0.1865, "num_input_tokens_seen": 5156872, "step": 27055 }, { "epoch": 14.064449064449065, "grad_norm": 0.7586489319801331, "learning_rate": 1.225921029299845e-05, "loss": 0.1922, "num_input_tokens_seen": 5157800, "step": 27060 }, { "epoch": 14.067047817047817, "grad_norm": 0.14956672489643097, "learning_rate": 1.2249455440970989e-05, "loss": 0.0952, "num_input_tokens_seen": 5158664, "step": 27065 }, { "epoch": 14.06964656964657, "grad_norm": 0.6353330612182617, "learning_rate": 1.22397032120344e-05, "loss": 0.042, "num_input_tokens_seen": 5159592, "step": 27070 }, { "epoch": 14.072245322245323, "grad_norm": 0.6723522543907166, "learning_rate": 1.2229953608194938e-05, "loss": 0.0838, "num_input_tokens_seen": 5160552, "step": 27075 }, { "epoch": 14.074844074844075, "grad_norm": 0.11444002389907837, "learning_rate": 1.2220206631458327e-05, "loss": 0.1992, "num_input_tokens_seen": 5161448, "step": 27080 }, { "epoch": 14.077442827442827, "grad_norm": 0.14853495359420776, "learning_rate": 1.2210462283829755e-05, "loss": 0.1529, "num_input_tokens_seen": 5162376, "step": 27085 }, { "epoch": 14.08004158004158, "grad_norm": 0.7330371737480164, "learning_rate": 1.2200720567313863e-05, "loss": 0.2526, "num_input_tokens_seen": 5163336, "step": 27090 }, { "epoch": 14.082640332640333, "grad_norm": 0.7918946743011475, "learning_rate": 1.2190981483914763e-05, "loss": 0.1044, "num_input_tokens_seen": 5164328, "step": 27095 }, { "epoch": 14.085239085239085, "grad_norm": 1.1215962171554565, "learning_rate": 1.2181245035635997e-05, "loss": 0.068, "num_input_tokens_seen": 5165256, "step": 27100 }, { "epoch": 14.087837837837839, "grad_norm": 0.13265596330165863, "learning_rate": 1.2171511224480606e-05, "loss": 0.0664, "num_input_tokens_seen": 5166184, "step": 27105 }, { "epoch": 14.09043659043659, "grad_norm": 0.2136898934841156, "learning_rate": 1.2161780052451061e-05, "loss": 0.1421, "num_input_tokens_seen": 5167144, "step": 27110 }, { "epoch": 14.093035343035343, "grad_norm": 0.938709557056427, "learning_rate": 1.215205152154929e-05, "loss": 0.1057, "num_input_tokens_seen": 5168072, "step": 27115 }, { "epoch": 14.095634095634095, "grad_norm": 0.17167283594608307, "learning_rate": 1.2142325633776679e-05, "loss": 0.1037, "num_input_tokens_seen": 5169032, "step": 27120 }, { "epoch": 14.098232848232849, "grad_norm": 0.14761407673358917, "learning_rate": 1.2132602391134096e-05, "loss": 0.1741, "num_input_tokens_seen": 5169928, "step": 27125 }, { "epoch": 14.1008316008316, "grad_norm": 1.5934752225875854, "learning_rate": 1.2122881795621829e-05, "loss": 0.2289, "num_input_tokens_seen": 5170920, "step": 27130 }, { "epoch": 14.103430353430353, "grad_norm": 0.7531123757362366, "learning_rate": 1.2113163849239636e-05, "loss": 0.0449, "num_input_tokens_seen": 5171816, "step": 27135 }, { "epoch": 14.106029106029107, "grad_norm": 1.066642165184021, "learning_rate": 1.2103448553986723e-05, "loss": 0.1261, "num_input_tokens_seen": 5172840, "step": 27140 }, { "epoch": 14.108627858627859, "grad_norm": 0.49545174837112427, "learning_rate": 1.2093735911861778e-05, "loss": 0.063, "num_input_tokens_seen": 5173800, "step": 27145 }, { "epoch": 14.111226611226611, "grad_norm": 1.4197382926940918, "learning_rate": 1.2084025924862922e-05, "loss": 0.1168, "num_input_tokens_seen": 5174760, "step": 27150 }, { "epoch": 14.113825363825363, "grad_norm": 0.29521384835243225, "learning_rate": 1.2074318594987702e-05, "loss": 0.0826, "num_input_tokens_seen": 5175752, "step": 27155 }, { "epoch": 14.116424116424117, "grad_norm": 0.6908204555511475, "learning_rate": 1.2064613924233174e-05, "loss": 0.0918, "num_input_tokens_seen": 5176616, "step": 27160 }, { "epoch": 14.119022869022869, "grad_norm": 1.6113225221633911, "learning_rate": 1.2054911914595809e-05, "loss": 0.1299, "num_input_tokens_seen": 5177576, "step": 27165 }, { "epoch": 14.121621621621621, "grad_norm": 0.5538796782493591, "learning_rate": 1.2045212568071549e-05, "loss": 0.1617, "num_input_tokens_seen": 5178536, "step": 27170 }, { "epoch": 14.124220374220375, "grad_norm": 0.5291311144828796, "learning_rate": 1.2035515886655762e-05, "loss": 0.2196, "num_input_tokens_seen": 5179560, "step": 27175 }, { "epoch": 14.126819126819127, "grad_norm": 0.2935788929462433, "learning_rate": 1.2025821872343313e-05, "loss": 0.173, "num_input_tokens_seen": 5180488, "step": 27180 }, { "epoch": 14.129417879417879, "grad_norm": 0.10759720951318741, "learning_rate": 1.2016130527128477e-05, "loss": 0.1957, "num_input_tokens_seen": 5181416, "step": 27185 }, { "epoch": 14.132016632016631, "grad_norm": 0.6531254649162292, "learning_rate": 1.2006441853004988e-05, "loss": 0.0662, "num_input_tokens_seen": 5182344, "step": 27190 }, { "epoch": 14.134615384615385, "grad_norm": 0.5577284693717957, "learning_rate": 1.1996755851966054e-05, "loss": 0.0728, "num_input_tokens_seen": 5183304, "step": 27195 }, { "epoch": 14.137214137214137, "grad_norm": 1.0119822025299072, "learning_rate": 1.1987072526004312e-05, "loss": 0.1073, "num_input_tokens_seen": 5184232, "step": 27200 }, { "epoch": 14.13981288981289, "grad_norm": 0.7917800545692444, "learning_rate": 1.1977391877111846e-05, "loss": 0.1023, "num_input_tokens_seen": 5185192, "step": 27205 }, { "epoch": 14.142411642411643, "grad_norm": 0.04592296853661537, "learning_rate": 1.196771390728019e-05, "loss": 0.0369, "num_input_tokens_seen": 5186120, "step": 27210 }, { "epoch": 14.145010395010395, "grad_norm": 1.458072543144226, "learning_rate": 1.1958038618500362e-05, "loss": 0.127, "num_input_tokens_seen": 5187048, "step": 27215 }, { "epoch": 14.147609147609147, "grad_norm": 0.7735533118247986, "learning_rate": 1.1948366012762772e-05, "loss": 0.2501, "num_input_tokens_seen": 5188040, "step": 27220 }, { "epoch": 14.1502079002079, "grad_norm": 0.28680580854415894, "learning_rate": 1.19386960920573e-05, "loss": 0.1262, "num_input_tokens_seen": 5188968, "step": 27225 }, { "epoch": 14.152806652806653, "grad_norm": 0.5156967639923096, "learning_rate": 1.1929028858373303e-05, "loss": 0.0743, "num_input_tokens_seen": 5189896, "step": 27230 }, { "epoch": 14.155405405405405, "grad_norm": 0.973950982093811, "learning_rate": 1.1919364313699551e-05, "loss": 0.1023, "num_input_tokens_seen": 5190888, "step": 27235 }, { "epoch": 14.158004158004157, "grad_norm": 1.7772681713104248, "learning_rate": 1.1909702460024272e-05, "loss": 0.1605, "num_input_tokens_seen": 5191880, "step": 27240 }, { "epoch": 14.160602910602911, "grad_norm": 0.7907083034515381, "learning_rate": 1.1900043299335129e-05, "loss": 0.1532, "num_input_tokens_seen": 5192840, "step": 27245 }, { "epoch": 14.163201663201663, "grad_norm": 0.6452680230140686, "learning_rate": 1.1890386833619258e-05, "loss": 0.2025, "num_input_tokens_seen": 5193832, "step": 27250 }, { "epoch": 14.165800415800415, "grad_norm": 1.0665404796600342, "learning_rate": 1.188073306486322e-05, "loss": 0.1288, "num_input_tokens_seen": 5194664, "step": 27255 }, { "epoch": 14.16839916839917, "grad_norm": 0.10568959265947342, "learning_rate": 1.1871081995053019e-05, "loss": 0.1241, "num_input_tokens_seen": 5195528, "step": 27260 }, { "epoch": 14.170997920997921, "grad_norm": 0.5344493985176086, "learning_rate": 1.1861433626174107e-05, "loss": 0.1814, "num_input_tokens_seen": 5196488, "step": 27265 }, { "epoch": 14.173596673596673, "grad_norm": 0.755229651927948, "learning_rate": 1.1851787960211398e-05, "loss": 0.105, "num_input_tokens_seen": 5197448, "step": 27270 }, { "epoch": 14.176195426195425, "grad_norm": 0.21932873129844666, "learning_rate": 1.1842144999149226e-05, "loss": 0.0779, "num_input_tokens_seen": 5198504, "step": 27275 }, { "epoch": 14.17879417879418, "grad_norm": 0.6586388945579529, "learning_rate": 1.1832504744971379e-05, "loss": 0.083, "num_input_tokens_seen": 5199432, "step": 27280 }, { "epoch": 14.181392931392931, "grad_norm": 1.0481926202774048, "learning_rate": 1.182286719966109e-05, "loss": 0.2197, "num_input_tokens_seen": 5200392, "step": 27285 }, { "epoch": 14.183991683991684, "grad_norm": 0.6794895529747009, "learning_rate": 1.1813232365201025e-05, "loss": 0.1615, "num_input_tokens_seen": 5201384, "step": 27290 }, { "epoch": 14.186590436590437, "grad_norm": 0.7540923953056335, "learning_rate": 1.1803600243573304e-05, "loss": 0.0729, "num_input_tokens_seen": 5202312, "step": 27295 }, { "epoch": 14.18918918918919, "grad_norm": 0.9221634268760681, "learning_rate": 1.1793970836759475e-05, "loss": 0.1108, "num_input_tokens_seen": 5203304, "step": 27300 }, { "epoch": 14.191787941787942, "grad_norm": 0.3323630690574646, "learning_rate": 1.178434414674055e-05, "loss": 0.0844, "num_input_tokens_seen": 5204168, "step": 27305 }, { "epoch": 14.194386694386694, "grad_norm": 0.791296660900116, "learning_rate": 1.1774720175496967e-05, "loss": 0.2169, "num_input_tokens_seen": 5205224, "step": 27310 }, { "epoch": 14.196985446985448, "grad_norm": 0.14707159996032715, "learning_rate": 1.176509892500859e-05, "loss": 0.1373, "num_input_tokens_seen": 5206152, "step": 27315 }, { "epoch": 14.1995841995842, "grad_norm": 0.191881000995636, "learning_rate": 1.1755480397254762e-05, "loss": 0.218, "num_input_tokens_seen": 5207144, "step": 27320 }, { "epoch": 14.202182952182952, "grad_norm": 0.7161228060722351, "learning_rate": 1.1745864594214231e-05, "loss": 0.115, "num_input_tokens_seen": 5208168, "step": 27325 }, { "epoch": 14.204781704781706, "grad_norm": 0.40036535263061523, "learning_rate": 1.17362515178652e-05, "loss": 0.084, "num_input_tokens_seen": 5209128, "step": 27330 }, { "epoch": 14.207380457380458, "grad_norm": 1.0971583127975464, "learning_rate": 1.1726641170185295e-05, "loss": 0.1314, "num_input_tokens_seen": 5210152, "step": 27335 }, { "epoch": 14.20997920997921, "grad_norm": 0.08751611411571503, "learning_rate": 1.1717033553151615e-05, "loss": 0.0909, "num_input_tokens_seen": 5211112, "step": 27340 }, { "epoch": 14.212577962577962, "grad_norm": 0.14209312200546265, "learning_rate": 1.1707428668740678e-05, "loss": 0.0905, "num_input_tokens_seen": 5212104, "step": 27345 }, { "epoch": 14.215176715176716, "grad_norm": 0.9392570853233337, "learning_rate": 1.1697826518928406e-05, "loss": 0.0746, "num_input_tokens_seen": 5213064, "step": 27350 }, { "epoch": 14.217775467775468, "grad_norm": 0.2627524137496948, "learning_rate": 1.1688227105690216e-05, "loss": 0.1369, "num_input_tokens_seen": 5213960, "step": 27355 }, { "epoch": 14.22037422037422, "grad_norm": 0.3752031922340393, "learning_rate": 1.167863043100093e-05, "loss": 0.1088, "num_input_tokens_seen": 5214824, "step": 27360 }, { "epoch": 14.222972972972974, "grad_norm": 0.9508664608001709, "learning_rate": 1.1669036496834815e-05, "loss": 0.1088, "num_input_tokens_seen": 5215784, "step": 27365 }, { "epoch": 14.225571725571726, "grad_norm": 0.8640248775482178, "learning_rate": 1.165944530516556e-05, "loss": 0.0961, "num_input_tokens_seen": 5216776, "step": 27370 }, { "epoch": 14.228170478170478, "grad_norm": 0.10714223980903625, "learning_rate": 1.1649856857966318e-05, "loss": 0.1469, "num_input_tokens_seen": 5217640, "step": 27375 }, { "epoch": 14.23076923076923, "grad_norm": 0.4243513345718384, "learning_rate": 1.1640271157209654e-05, "loss": 0.0627, "num_input_tokens_seen": 5218568, "step": 27380 }, { "epoch": 14.233367983367984, "grad_norm": 0.09193158894777298, "learning_rate": 1.1630688204867574e-05, "loss": 0.1268, "num_input_tokens_seen": 5219528, "step": 27385 }, { "epoch": 14.235966735966736, "grad_norm": 0.45511648058891296, "learning_rate": 1.1621108002911513e-05, "loss": 0.1777, "num_input_tokens_seen": 5220456, "step": 27390 }, { "epoch": 14.238565488565488, "grad_norm": 0.32024824619293213, "learning_rate": 1.1611530553312363e-05, "loss": 0.2547, "num_input_tokens_seen": 5221480, "step": 27395 }, { "epoch": 14.241164241164242, "grad_norm": 0.5371700525283813, "learning_rate": 1.1601955858040425e-05, "loss": 0.1807, "num_input_tokens_seen": 5222440, "step": 27400 }, { "epoch": 14.243762993762994, "grad_norm": 1.4255719184875488, "learning_rate": 1.1592383919065442e-05, "loss": 0.1889, "num_input_tokens_seen": 5223464, "step": 27405 }, { "epoch": 14.246361746361746, "grad_norm": 0.11025887727737427, "learning_rate": 1.1582814738356579e-05, "loss": 0.1845, "num_input_tokens_seen": 5224392, "step": 27410 }, { "epoch": 14.248960498960498, "grad_norm": 0.5476245284080505, "learning_rate": 1.1573248317882476e-05, "loss": 0.0823, "num_input_tokens_seen": 5225320, "step": 27415 }, { "epoch": 14.251559251559252, "grad_norm": 0.6077117323875427, "learning_rate": 1.1563684659611143e-05, "loss": 0.1299, "num_input_tokens_seen": 5226344, "step": 27420 }, { "epoch": 14.254158004158004, "grad_norm": 0.2833963632583618, "learning_rate": 1.1554123765510056e-05, "loss": 0.1574, "num_input_tokens_seen": 5227272, "step": 27425 }, { "epoch": 14.256756756756756, "grad_norm": 0.7987180948257446, "learning_rate": 1.1544565637546131e-05, "loss": 0.188, "num_input_tokens_seen": 5228264, "step": 27430 }, { "epoch": 14.25935550935551, "grad_norm": 0.4291248023509979, "learning_rate": 1.1535010277685699e-05, "loss": 0.0783, "num_input_tokens_seen": 5229192, "step": 27435 }, { "epoch": 14.261954261954262, "grad_norm": 0.8327137231826782, "learning_rate": 1.1525457687894512e-05, "loss": 0.0739, "num_input_tokens_seen": 5230152, "step": 27440 }, { "epoch": 14.264553014553014, "grad_norm": 0.08749786764383316, "learning_rate": 1.1515907870137787e-05, "loss": 0.0383, "num_input_tokens_seen": 5231080, "step": 27445 }, { "epoch": 14.267151767151766, "grad_norm": 0.3455516993999481, "learning_rate": 1.1506360826380139e-05, "loss": 0.069, "num_input_tokens_seen": 5231976, "step": 27450 }, { "epoch": 14.26975051975052, "grad_norm": 0.3457672894001007, "learning_rate": 1.1496816558585622e-05, "loss": 0.1298, "num_input_tokens_seen": 5232936, "step": 27455 }, { "epoch": 14.272349272349272, "grad_norm": 0.9228883385658264, "learning_rate": 1.1487275068717709e-05, "loss": 0.121, "num_input_tokens_seen": 5233896, "step": 27460 }, { "epoch": 14.274948024948024, "grad_norm": 0.737529456615448, "learning_rate": 1.1477736358739333e-05, "loss": 0.1014, "num_input_tokens_seen": 5234856, "step": 27465 }, { "epoch": 14.277546777546778, "grad_norm": 0.48568132519721985, "learning_rate": 1.1468200430612822e-05, "loss": 0.1644, "num_input_tokens_seen": 5235816, "step": 27470 }, { "epoch": 14.28014553014553, "grad_norm": 0.49269431829452515, "learning_rate": 1.1458667286299942e-05, "loss": 0.2082, "num_input_tokens_seen": 5236808, "step": 27475 }, { "epoch": 14.282744282744282, "grad_norm": 0.7297324538230896, "learning_rate": 1.1449136927761898e-05, "loss": 0.0644, "num_input_tokens_seen": 5237800, "step": 27480 }, { "epoch": 14.285343035343036, "grad_norm": 0.43774107098579407, "learning_rate": 1.1439609356959299e-05, "loss": 0.133, "num_input_tokens_seen": 5238824, "step": 27485 }, { "epoch": 14.287941787941788, "grad_norm": 0.2257765680551529, "learning_rate": 1.1430084575852199e-05, "loss": 0.179, "num_input_tokens_seen": 5239816, "step": 27490 }, { "epoch": 14.29054054054054, "grad_norm": 0.7757346630096436, "learning_rate": 1.1420562586400066e-05, "loss": 0.0842, "num_input_tokens_seen": 5240744, "step": 27495 }, { "epoch": 14.293139293139292, "grad_norm": 0.6655633449554443, "learning_rate": 1.1411043390561812e-05, "loss": 0.1962, "num_input_tokens_seen": 5241704, "step": 27500 }, { "epoch": 14.295738045738046, "grad_norm": 0.45131203532218933, "learning_rate": 1.140152699029576e-05, "loss": 0.1665, "num_input_tokens_seen": 5242760, "step": 27505 }, { "epoch": 14.298336798336798, "grad_norm": 0.6246790885925293, "learning_rate": 1.1392013387559658e-05, "loss": 0.2016, "num_input_tokens_seen": 5243656, "step": 27510 }, { "epoch": 14.30093555093555, "grad_norm": 2.1244781017303467, "learning_rate": 1.138250258431067e-05, "loss": 0.1937, "num_input_tokens_seen": 5244584, "step": 27515 }, { "epoch": 14.303534303534304, "grad_norm": 1.1132885217666626, "learning_rate": 1.1372994582505415e-05, "loss": 0.2726, "num_input_tokens_seen": 5245544, "step": 27520 }, { "epoch": 14.306133056133056, "grad_norm": 0.2908672094345093, "learning_rate": 1.1363489384099901e-05, "loss": 0.1194, "num_input_tokens_seen": 5246504, "step": 27525 }, { "epoch": 14.308731808731808, "grad_norm": 0.12671388685703278, "learning_rate": 1.1353986991049584e-05, "loss": 0.131, "num_input_tokens_seen": 5247464, "step": 27530 }, { "epoch": 14.31133056133056, "grad_norm": 0.39200180768966675, "learning_rate": 1.1344487405309315e-05, "loss": 0.1982, "num_input_tokens_seen": 5248392, "step": 27535 }, { "epoch": 14.313929313929314, "grad_norm": 0.21536850929260254, "learning_rate": 1.1334990628833406e-05, "loss": 0.1298, "num_input_tokens_seen": 5249352, "step": 27540 }, { "epoch": 14.316528066528067, "grad_norm": 0.13446615636348724, "learning_rate": 1.1325496663575571e-05, "loss": 0.074, "num_input_tokens_seen": 5250472, "step": 27545 }, { "epoch": 14.319126819126819, "grad_norm": 0.28558728098869324, "learning_rate": 1.1316005511488918e-05, "loss": 0.1233, "num_input_tokens_seen": 5251400, "step": 27550 }, { "epoch": 14.321725571725572, "grad_norm": 0.07987614721059799, "learning_rate": 1.130651717452603e-05, "loss": 0.1335, "num_input_tokens_seen": 5252360, "step": 27555 }, { "epoch": 14.324324324324325, "grad_norm": 0.6439270377159119, "learning_rate": 1.1297031654638873e-05, "loss": 0.1371, "num_input_tokens_seen": 5253288, "step": 27560 }, { "epoch": 14.326923076923077, "grad_norm": 0.20283019542694092, "learning_rate": 1.1287548953778837e-05, "loss": 0.116, "num_input_tokens_seen": 5254248, "step": 27565 }, { "epoch": 14.329521829521829, "grad_norm": 1.2619237899780273, "learning_rate": 1.1278069073896757e-05, "loss": 0.1027, "num_input_tokens_seen": 5255144, "step": 27570 }, { "epoch": 14.332120582120583, "grad_norm": 1.499832034111023, "learning_rate": 1.1268592016942864e-05, "loss": 0.1319, "num_input_tokens_seen": 5256136, "step": 27575 }, { "epoch": 14.334719334719335, "grad_norm": 0.1386694312095642, "learning_rate": 1.125911778486681e-05, "loss": 0.2861, "num_input_tokens_seen": 5257128, "step": 27580 }, { "epoch": 14.337318087318087, "grad_norm": 1.5214992761611938, "learning_rate": 1.1249646379617662e-05, "loss": 0.0683, "num_input_tokens_seen": 5258088, "step": 27585 }, { "epoch": 14.33991683991684, "grad_norm": 0.5031083226203918, "learning_rate": 1.1240177803143937e-05, "loss": 0.1304, "num_input_tokens_seen": 5259048, "step": 27590 }, { "epoch": 14.342515592515593, "grad_norm": 0.8284729719161987, "learning_rate": 1.1230712057393533e-05, "loss": 0.107, "num_input_tokens_seen": 5259976, "step": 27595 }, { "epoch": 14.345114345114345, "grad_norm": 1.3873143196105957, "learning_rate": 1.1221249144313778e-05, "loss": 0.1189, "num_input_tokens_seen": 5260904, "step": 27600 }, { "epoch": 14.347713097713097, "grad_norm": 0.10604052245616913, "learning_rate": 1.1211789065851416e-05, "loss": 0.0471, "num_input_tokens_seen": 5261768, "step": 27605 }, { "epoch": 14.35031185031185, "grad_norm": 0.1684332638978958, "learning_rate": 1.120233182395263e-05, "loss": 0.261, "num_input_tokens_seen": 5262664, "step": 27610 }, { "epoch": 14.352910602910603, "grad_norm": 0.10336526483297348, "learning_rate": 1.1192877420562981e-05, "loss": 0.0951, "num_input_tokens_seen": 5263624, "step": 27615 }, { "epoch": 14.355509355509355, "grad_norm": 0.1476699858903885, "learning_rate": 1.1183425857627458e-05, "loss": 0.1251, "num_input_tokens_seen": 5264552, "step": 27620 }, { "epoch": 14.358108108108109, "grad_norm": 0.46384719014167786, "learning_rate": 1.1173977137090494e-05, "loss": 0.1917, "num_input_tokens_seen": 5265480, "step": 27625 }, { "epoch": 14.36070686070686, "grad_norm": 0.7027050852775574, "learning_rate": 1.1164531260895908e-05, "loss": 0.2741, "num_input_tokens_seen": 5266504, "step": 27630 }, { "epoch": 14.363305613305613, "grad_norm": 0.6699409484863281, "learning_rate": 1.115508823098694e-05, "loss": 0.0755, "num_input_tokens_seen": 5267432, "step": 27635 }, { "epoch": 14.365904365904367, "grad_norm": 1.6150603294372559, "learning_rate": 1.1145648049306237e-05, "loss": 0.083, "num_input_tokens_seen": 5268392, "step": 27640 }, { "epoch": 14.368503118503119, "grad_norm": 0.6201978325843811, "learning_rate": 1.1136210717795887e-05, "loss": 0.1685, "num_input_tokens_seen": 5269384, "step": 27645 }, { "epoch": 14.371101871101871, "grad_norm": 1.0257865190505981, "learning_rate": 1.1126776238397368e-05, "loss": 0.2, "num_input_tokens_seen": 5270312, "step": 27650 }, { "epoch": 14.373700623700623, "grad_norm": 0.34702229499816895, "learning_rate": 1.1117344613051574e-05, "loss": 0.1286, "num_input_tokens_seen": 5271336, "step": 27655 }, { "epoch": 14.376299376299377, "grad_norm": 0.4350527822971344, "learning_rate": 1.1107915843698807e-05, "loss": 0.1413, "num_input_tokens_seen": 5272264, "step": 27660 }, { "epoch": 14.378898128898129, "grad_norm": 0.8144375085830688, "learning_rate": 1.1098489932278811e-05, "loss": 0.0847, "num_input_tokens_seen": 5273224, "step": 27665 }, { "epoch": 14.381496881496881, "grad_norm": 1.1365009546279907, "learning_rate": 1.1089066880730705e-05, "loss": 0.0747, "num_input_tokens_seen": 5274184, "step": 27670 }, { "epoch": 14.384095634095633, "grad_norm": 1.993143916130066, "learning_rate": 1.107964669099304e-05, "loss": 0.1074, "num_input_tokens_seen": 5275112, "step": 27675 }, { "epoch": 14.386694386694387, "grad_norm": 0.755243718624115, "learning_rate": 1.1070229365003774e-05, "loss": 0.1004, "num_input_tokens_seen": 5276040, "step": 27680 }, { "epoch": 14.38929313929314, "grad_norm": 0.5355890989303589, "learning_rate": 1.106081490470027e-05, "loss": 0.0599, "num_input_tokens_seen": 5276936, "step": 27685 }, { "epoch": 14.391891891891891, "grad_norm": 0.865784227848053, "learning_rate": 1.1051403312019305e-05, "loss": 0.0833, "num_input_tokens_seen": 5277928, "step": 27690 }, { "epoch": 14.394490644490645, "grad_norm": 0.7136682271957397, "learning_rate": 1.1041994588897079e-05, "loss": 0.1227, "num_input_tokens_seen": 5278824, "step": 27695 }, { "epoch": 14.397089397089397, "grad_norm": 1.2988622188568115, "learning_rate": 1.1032588737269187e-05, "loss": 0.2138, "num_input_tokens_seen": 5279784, "step": 27700 }, { "epoch": 14.39968814968815, "grad_norm": 2.2942628860473633, "learning_rate": 1.1023185759070634e-05, "loss": 0.1709, "num_input_tokens_seen": 5280808, "step": 27705 }, { "epoch": 14.402286902286903, "grad_norm": 0.20249401032924652, "learning_rate": 1.1013785656235828e-05, "loss": 0.0696, "num_input_tokens_seen": 5281768, "step": 27710 }, { "epoch": 14.404885654885655, "grad_norm": 0.7197967171669006, "learning_rate": 1.1004388430698615e-05, "loss": 0.0804, "num_input_tokens_seen": 5282696, "step": 27715 }, { "epoch": 14.407484407484407, "grad_norm": 0.16221211850643158, "learning_rate": 1.0994994084392216e-05, "loss": 0.0255, "num_input_tokens_seen": 5283720, "step": 27720 }, { "epoch": 14.41008316008316, "grad_norm": 1.038964033126831, "learning_rate": 1.0985602619249271e-05, "loss": 0.2286, "num_input_tokens_seen": 5284648, "step": 27725 }, { "epoch": 14.412681912681913, "grad_norm": 0.07557892799377441, "learning_rate": 1.0976214037201821e-05, "loss": 0.1328, "num_input_tokens_seen": 5285544, "step": 27730 }, { "epoch": 14.415280665280665, "grad_norm": 1.3327336311340332, "learning_rate": 1.096682834018134e-05, "loss": 0.1098, "num_input_tokens_seen": 5286504, "step": 27735 }, { "epoch": 14.417879417879417, "grad_norm": 0.1171182319521904, "learning_rate": 1.0957445530118688e-05, "loss": 0.1086, "num_input_tokens_seen": 5287464, "step": 27740 }, { "epoch": 14.420478170478171, "grad_norm": 0.3167717158794403, "learning_rate": 1.0948065608944105e-05, "loss": 0.1836, "num_input_tokens_seen": 5288360, "step": 27745 }, { "epoch": 14.423076923076923, "grad_norm": 1.1694979667663574, "learning_rate": 1.0938688578587294e-05, "loss": 0.1113, "num_input_tokens_seen": 5289320, "step": 27750 }, { "epoch": 14.425675675675675, "grad_norm": 0.7809287905693054, "learning_rate": 1.0929314440977323e-05, "loss": 0.2426, "num_input_tokens_seen": 5290312, "step": 27755 }, { "epoch": 14.428274428274428, "grad_norm": 0.14841331541538239, "learning_rate": 1.0919943198042676e-05, "loss": 0.0738, "num_input_tokens_seen": 5291368, "step": 27760 }, { "epoch": 14.430873180873181, "grad_norm": 0.47215795516967773, "learning_rate": 1.0910574851711231e-05, "loss": 0.1136, "num_input_tokens_seen": 5292232, "step": 27765 }, { "epoch": 14.433471933471933, "grad_norm": 0.5095823407173157, "learning_rate": 1.0901209403910298e-05, "loss": 0.1609, "num_input_tokens_seen": 5293224, "step": 27770 }, { "epoch": 14.436070686070686, "grad_norm": 0.10253645479679108, "learning_rate": 1.0891846856566564e-05, "loss": 0.1132, "num_input_tokens_seen": 5294216, "step": 27775 }, { "epoch": 14.43866943866944, "grad_norm": 0.4918402433395386, "learning_rate": 1.0882487211606129e-05, "loss": 0.1833, "num_input_tokens_seen": 5295176, "step": 27780 }, { "epoch": 14.441268191268192, "grad_norm": 1.0579694509506226, "learning_rate": 1.0873130470954487e-05, "loss": 0.1755, "num_input_tokens_seen": 5296168, "step": 27785 }, { "epoch": 14.443866943866944, "grad_norm": 0.10224203765392303, "learning_rate": 1.0863776636536558e-05, "loss": 0.1372, "num_input_tokens_seen": 5297160, "step": 27790 }, { "epoch": 14.446465696465696, "grad_norm": 1.59480619430542, "learning_rate": 1.085442571027664e-05, "loss": 0.1949, "num_input_tokens_seen": 5298056, "step": 27795 }, { "epoch": 14.44906444906445, "grad_norm": 0.8202735781669617, "learning_rate": 1.0845077694098432e-05, "loss": 0.1439, "num_input_tokens_seen": 5298952, "step": 27800 }, { "epoch": 14.451663201663202, "grad_norm": 1.1000994443893433, "learning_rate": 1.0835732589925074e-05, "loss": 0.1874, "num_input_tokens_seen": 5299912, "step": 27805 }, { "epoch": 14.454261954261954, "grad_norm": 0.3057195544242859, "learning_rate": 1.0826390399679045e-05, "loss": 0.1634, "num_input_tokens_seen": 5300904, "step": 27810 }, { "epoch": 14.456860706860708, "grad_norm": 1.757293939590454, "learning_rate": 1.0817051125282263e-05, "loss": 0.127, "num_input_tokens_seen": 5301768, "step": 27815 }, { "epoch": 14.45945945945946, "grad_norm": 0.8303114175796509, "learning_rate": 1.080771476865605e-05, "loss": 0.2314, "num_input_tokens_seen": 5302696, "step": 27820 }, { "epoch": 14.462058212058212, "grad_norm": 0.05244744196534157, "learning_rate": 1.0798381331721109e-05, "loss": 0.124, "num_input_tokens_seen": 5303592, "step": 27825 }, { "epoch": 14.464656964656964, "grad_norm": 0.759273886680603, "learning_rate": 1.0789050816397552e-05, "loss": 0.0893, "num_input_tokens_seen": 5304488, "step": 27830 }, { "epoch": 14.467255717255718, "grad_norm": 0.234585702419281, "learning_rate": 1.0779723224604881e-05, "loss": 0.0731, "num_input_tokens_seen": 5305448, "step": 27835 }, { "epoch": 14.46985446985447, "grad_norm": 0.4851650893688202, "learning_rate": 1.0770398558262021e-05, "loss": 0.0691, "num_input_tokens_seen": 5306408, "step": 27840 }, { "epoch": 14.472453222453222, "grad_norm": 0.0710831806063652, "learning_rate": 1.0761076819287269e-05, "loss": 0.078, "num_input_tokens_seen": 5307304, "step": 27845 }, { "epoch": 14.475051975051976, "grad_norm": 0.7512381076812744, "learning_rate": 1.0751758009598328e-05, "loss": 0.2086, "num_input_tokens_seen": 5308232, "step": 27850 }, { "epoch": 14.477650727650728, "grad_norm": 0.1946457326412201, "learning_rate": 1.074244213111229e-05, "loss": 0.0874, "num_input_tokens_seen": 5309160, "step": 27855 }, { "epoch": 14.48024948024948, "grad_norm": 0.12864316999912262, "learning_rate": 1.0733129185745672e-05, "loss": 0.0878, "num_input_tokens_seen": 5310216, "step": 27860 }, { "epoch": 14.482848232848234, "grad_norm": 1.2216496467590332, "learning_rate": 1.072381917541436e-05, "loss": 0.212, "num_input_tokens_seen": 5311336, "step": 27865 }, { "epoch": 14.485446985446986, "grad_norm": 0.47580620646476746, "learning_rate": 1.0714512102033645e-05, "loss": 0.1422, "num_input_tokens_seen": 5312328, "step": 27870 }, { "epoch": 14.488045738045738, "grad_norm": 0.7244770526885986, "learning_rate": 1.070520796751821e-05, "loss": 0.1964, "num_input_tokens_seen": 5313352, "step": 27875 }, { "epoch": 14.49064449064449, "grad_norm": 0.07290297001600266, "learning_rate": 1.0695906773782145e-05, "loss": 0.1188, "num_input_tokens_seen": 5314312, "step": 27880 }, { "epoch": 14.493243243243244, "grad_norm": 0.4667093753814697, "learning_rate": 1.068660852273892e-05, "loss": 0.2295, "num_input_tokens_seen": 5315208, "step": 27885 }, { "epoch": 14.495841995841996, "grad_norm": 0.6587992906570435, "learning_rate": 1.0677313216301402e-05, "loss": 0.1327, "num_input_tokens_seen": 5316072, "step": 27890 }, { "epoch": 14.498440748440748, "grad_norm": 0.16759535670280457, "learning_rate": 1.0668020856381872e-05, "loss": 0.055, "num_input_tokens_seen": 5316968, "step": 27895 }, { "epoch": 14.5010395010395, "grad_norm": 0.1454005092382431, "learning_rate": 1.0658731444891984e-05, "loss": 0.0377, "num_input_tokens_seen": 5317896, "step": 27900 }, { "epoch": 14.503638253638254, "grad_norm": 0.6098172068595886, "learning_rate": 1.064944498374279e-05, "loss": 0.0298, "num_input_tokens_seen": 5318824, "step": 27905 }, { "epoch": 14.506237006237006, "grad_norm": 0.09685872495174408, "learning_rate": 1.0640161474844726e-05, "loss": 0.0654, "num_input_tokens_seen": 5319720, "step": 27910 }, { "epoch": 14.508835758835758, "grad_norm": 0.29957664012908936, "learning_rate": 1.0630880920107649e-05, "loss": 0.1802, "num_input_tokens_seen": 5320744, "step": 27915 }, { "epoch": 14.511434511434512, "grad_norm": 0.4756670594215393, "learning_rate": 1.0621603321440784e-05, "loss": 0.0818, "num_input_tokens_seen": 5321768, "step": 27920 }, { "epoch": 14.514033264033264, "grad_norm": 0.718478262424469, "learning_rate": 1.0612328680752745e-05, "loss": 0.0776, "num_input_tokens_seen": 5322728, "step": 27925 }, { "epoch": 14.516632016632016, "grad_norm": 1.4644179344177246, "learning_rate": 1.060305699995156e-05, "loss": 0.0715, "num_input_tokens_seen": 5323752, "step": 27930 }, { "epoch": 14.51923076923077, "grad_norm": 0.19414377212524414, "learning_rate": 1.059378828094464e-05, "loss": 0.1706, "num_input_tokens_seen": 5324680, "step": 27935 }, { "epoch": 14.521829521829522, "grad_norm": 0.8931695222854614, "learning_rate": 1.0584522525638752e-05, "loss": 0.1075, "num_input_tokens_seen": 5325736, "step": 27940 }, { "epoch": 14.524428274428274, "grad_norm": 0.7318140268325806, "learning_rate": 1.057525973594011e-05, "loss": 0.1056, "num_input_tokens_seen": 5326760, "step": 27945 }, { "epoch": 14.527027027027026, "grad_norm": 0.9427480697631836, "learning_rate": 1.0565999913754282e-05, "loss": 0.2116, "num_input_tokens_seen": 5327688, "step": 27950 }, { "epoch": 14.52962577962578, "grad_norm": 1.109911561012268, "learning_rate": 1.0556743060986233e-05, "loss": 0.1012, "num_input_tokens_seen": 5328616, "step": 27955 }, { "epoch": 14.532224532224532, "grad_norm": 0.8034278750419617, "learning_rate": 1.0547489179540307e-05, "loss": 0.0624, "num_input_tokens_seen": 5329576, "step": 27960 }, { "epoch": 14.534823284823284, "grad_norm": 0.23120559751987457, "learning_rate": 1.0538238271320271e-05, "loss": 0.0627, "num_input_tokens_seen": 5330632, "step": 27965 }, { "epoch": 14.537422037422038, "grad_norm": 0.48148128390312195, "learning_rate": 1.0528990338229248e-05, "loss": 0.074, "num_input_tokens_seen": 5331496, "step": 27970 }, { "epoch": 14.54002079002079, "grad_norm": 0.9335080981254578, "learning_rate": 1.0519745382169751e-05, "loss": 0.2581, "num_input_tokens_seen": 5332456, "step": 27975 }, { "epoch": 14.542619542619542, "grad_norm": 0.6110220551490784, "learning_rate": 1.051050340504369e-05, "loss": 0.1579, "num_input_tokens_seen": 5333416, "step": 27980 }, { "epoch": 14.545218295218294, "grad_norm": 1.2143296003341675, "learning_rate": 1.0501264408752368e-05, "loss": 0.1431, "num_input_tokens_seen": 5334280, "step": 27985 }, { "epoch": 14.547817047817048, "grad_norm": 0.8770349025726318, "learning_rate": 1.0492028395196463e-05, "loss": 0.2098, "num_input_tokens_seen": 5335208, "step": 27990 }, { "epoch": 14.5504158004158, "grad_norm": 0.1342761218547821, "learning_rate": 1.048279536627604e-05, "loss": 0.1549, "num_input_tokens_seen": 5336200, "step": 27995 }, { "epoch": 14.553014553014552, "grad_norm": 0.14431390166282654, "learning_rate": 1.0473565323890556e-05, "loss": 0.0472, "num_input_tokens_seen": 5337160, "step": 28000 }, { "epoch": 14.555613305613306, "grad_norm": 0.7736300826072693, "learning_rate": 1.0464338269938848e-05, "loss": 0.1535, "num_input_tokens_seen": 5338152, "step": 28005 }, { "epoch": 14.558212058212058, "grad_norm": 0.3931065499782562, "learning_rate": 1.0455114206319144e-05, "loss": 0.1888, "num_input_tokens_seen": 5339080, "step": 28010 }, { "epoch": 14.56081081081081, "grad_norm": 1.0799461603164673, "learning_rate": 1.0445893134929042e-05, "loss": 0.1854, "num_input_tokens_seen": 5340040, "step": 28015 }, { "epoch": 14.563409563409563, "grad_norm": 0.2104947715997696, "learning_rate": 1.0436675057665554e-05, "loss": 0.1348, "num_input_tokens_seen": 5340968, "step": 28020 }, { "epoch": 14.566008316008316, "grad_norm": 0.6202840805053711, "learning_rate": 1.0427459976425052e-05, "loss": 0.1728, "num_input_tokens_seen": 5341928, "step": 28025 }, { "epoch": 14.568607068607069, "grad_norm": 0.1811652034521103, "learning_rate": 1.0418247893103298e-05, "loss": 0.0649, "num_input_tokens_seen": 5342888, "step": 28030 }, { "epoch": 14.57120582120582, "grad_norm": 0.3315933048725128, "learning_rate": 1.0409038809595423e-05, "loss": 0.1272, "num_input_tokens_seen": 5343816, "step": 28035 }, { "epoch": 14.573804573804575, "grad_norm": 1.4634320735931396, "learning_rate": 1.0399832727795978e-05, "loss": 0.1407, "num_input_tokens_seen": 5344744, "step": 28040 }, { "epoch": 14.576403326403327, "grad_norm": 0.5655027627944946, "learning_rate": 1.039062964959886e-05, "loss": 0.2243, "num_input_tokens_seen": 5345736, "step": 28045 }, { "epoch": 14.579002079002079, "grad_norm": 0.2916587293148041, "learning_rate": 1.0381429576897355e-05, "loss": 0.0695, "num_input_tokens_seen": 5346696, "step": 28050 }, { "epoch": 14.58160083160083, "grad_norm": 0.43065187335014343, "learning_rate": 1.0372232511584157e-05, "loss": 0.0931, "num_input_tokens_seen": 5347656, "step": 28055 }, { "epoch": 14.584199584199585, "grad_norm": 1.008347988128662, "learning_rate": 1.036303845555131e-05, "loss": 0.2088, "num_input_tokens_seen": 5348584, "step": 28060 }, { "epoch": 14.586798336798337, "grad_norm": 0.27765384316444397, "learning_rate": 1.0353847410690251e-05, "loss": 0.1005, "num_input_tokens_seen": 5349448, "step": 28065 }, { "epoch": 14.589397089397089, "grad_norm": 0.517540454864502, "learning_rate": 1.0344659378891797e-05, "loss": 0.0563, "num_input_tokens_seen": 5350408, "step": 28070 }, { "epoch": 14.591995841995843, "grad_norm": 1.3020920753479004, "learning_rate": 1.0335474362046144e-05, "loss": 0.1377, "num_input_tokens_seen": 5351368, "step": 28075 }, { "epoch": 14.594594594594595, "grad_norm": 0.3220742344856262, "learning_rate": 1.0326292362042872e-05, "loss": 0.0728, "num_input_tokens_seen": 5352296, "step": 28080 }, { "epoch": 14.597193347193347, "grad_norm": 0.3937050402164459, "learning_rate": 1.0317113380770924e-05, "loss": 0.1013, "num_input_tokens_seen": 5353224, "step": 28085 }, { "epoch": 14.5997920997921, "grad_norm": 0.24209339916706085, "learning_rate": 1.0307937420118658e-05, "loss": 0.1816, "num_input_tokens_seen": 5354216, "step": 28090 }, { "epoch": 14.602390852390853, "grad_norm": 2.561350107192993, "learning_rate": 1.0298764481973775e-05, "loss": 0.2895, "num_input_tokens_seen": 5355176, "step": 28095 }, { "epoch": 14.604989604989605, "grad_norm": 0.19229651987552643, "learning_rate": 1.0289594568223365e-05, "loss": 0.0593, "num_input_tokens_seen": 5356200, "step": 28100 }, { "epoch": 14.607588357588357, "grad_norm": 0.22688642144203186, "learning_rate": 1.0280427680753896e-05, "loss": 0.0776, "num_input_tokens_seen": 5357160, "step": 28105 }, { "epoch": 14.61018711018711, "grad_norm": 0.29655721783638, "learning_rate": 1.0271263821451227e-05, "loss": 0.0663, "num_input_tokens_seen": 5358152, "step": 28110 }, { "epoch": 14.612785862785863, "grad_norm": 0.8963977098464966, "learning_rate": 1.0262102992200576e-05, "loss": 0.251, "num_input_tokens_seen": 5359176, "step": 28115 }, { "epoch": 14.615384615384615, "grad_norm": 0.3105325996875763, "learning_rate": 1.0252945194886543e-05, "loss": 0.0771, "num_input_tokens_seen": 5360136, "step": 28120 }, { "epoch": 14.617983367983367, "grad_norm": 1.1056801080703735, "learning_rate": 1.0243790431393097e-05, "loss": 0.1907, "num_input_tokens_seen": 5361064, "step": 28125 }, { "epoch": 14.620582120582121, "grad_norm": 0.2317052185535431, "learning_rate": 1.0234638703603616e-05, "loss": 0.1296, "num_input_tokens_seen": 5361960, "step": 28130 }, { "epoch": 14.623180873180873, "grad_norm": 0.7736721634864807, "learning_rate": 1.0225490013400801e-05, "loss": 0.1506, "num_input_tokens_seen": 5362920, "step": 28135 }, { "epoch": 14.625779625779625, "grad_norm": 0.5300967693328857, "learning_rate": 1.0216344362666761e-05, "loss": 0.0837, "num_input_tokens_seen": 5363848, "step": 28140 }, { "epoch": 14.628378378378379, "grad_norm": 0.4148204028606415, "learning_rate": 1.0207201753282989e-05, "loss": 0.0893, "num_input_tokens_seen": 5364936, "step": 28145 }, { "epoch": 14.630977130977131, "grad_norm": 0.240904301404953, "learning_rate": 1.0198062187130324e-05, "loss": 0.1501, "num_input_tokens_seen": 5365896, "step": 28150 }, { "epoch": 14.633575883575883, "grad_norm": 1.1950379610061646, "learning_rate": 1.0188925666089e-05, "loss": 0.1449, "num_input_tokens_seen": 5366856, "step": 28155 }, { "epoch": 14.636174636174637, "grad_norm": 0.7940139174461365, "learning_rate": 1.0179792192038604e-05, "loss": 0.1383, "num_input_tokens_seen": 5367816, "step": 28160 }, { "epoch": 14.638773388773389, "grad_norm": 0.44848865270614624, "learning_rate": 1.0170661766858129e-05, "loss": 0.0643, "num_input_tokens_seen": 5368808, "step": 28165 }, { "epoch": 14.641372141372141, "grad_norm": 0.34687739610671997, "learning_rate": 1.0161534392425911e-05, "loss": 0.0729, "num_input_tokens_seen": 5369800, "step": 28170 }, { "epoch": 14.643970893970893, "grad_norm": 1.6839172840118408, "learning_rate": 1.0152410070619658e-05, "loss": 0.2754, "num_input_tokens_seen": 5370760, "step": 28175 }, { "epoch": 14.646569646569647, "grad_norm": 0.0781148299574852, "learning_rate": 1.0143288803316483e-05, "loss": 0.1544, "num_input_tokens_seen": 5371752, "step": 28180 }, { "epoch": 14.6491683991684, "grad_norm": 0.5667974352836609, "learning_rate": 1.0134170592392836e-05, "loss": 0.1117, "num_input_tokens_seen": 5372648, "step": 28185 }, { "epoch": 14.651767151767151, "grad_norm": 1.3159761428833008, "learning_rate": 1.0125055439724548e-05, "loss": 0.139, "num_input_tokens_seen": 5373672, "step": 28190 }, { "epoch": 14.654365904365905, "grad_norm": 0.8602209687232971, "learning_rate": 1.0115943347186826e-05, "loss": 0.1179, "num_input_tokens_seen": 5374632, "step": 28195 }, { "epoch": 14.656964656964657, "grad_norm": 0.4631529748439789, "learning_rate": 1.0106834316654246e-05, "loss": 0.1532, "num_input_tokens_seen": 5375560, "step": 28200 }, { "epoch": 14.65956340956341, "grad_norm": 0.13607937097549438, "learning_rate": 1.009772835000075e-05, "loss": 0.0527, "num_input_tokens_seen": 5376520, "step": 28205 }, { "epoch": 14.662162162162161, "grad_norm": 1.0070353746414185, "learning_rate": 1.0088625449099642e-05, "loss": 0.1514, "num_input_tokens_seen": 5377384, "step": 28210 }, { "epoch": 14.664760914760915, "grad_norm": 0.2087157517671585, "learning_rate": 1.007952561582363e-05, "loss": 0.0693, "num_input_tokens_seen": 5378280, "step": 28215 }, { "epoch": 14.667359667359667, "grad_norm": 0.7429159283638, "learning_rate": 1.0070428852044747e-05, "loss": 0.2399, "num_input_tokens_seen": 5379240, "step": 28220 }, { "epoch": 14.66995841995842, "grad_norm": 0.5944414734840393, "learning_rate": 1.0061335159634425e-05, "loss": 0.0927, "num_input_tokens_seen": 5380264, "step": 28225 }, { "epoch": 14.672557172557173, "grad_norm": 0.13066473603248596, "learning_rate": 1.005224454046344e-05, "loss": 0.127, "num_input_tokens_seen": 5381224, "step": 28230 }, { "epoch": 14.675155925155925, "grad_norm": 0.666598379611969, "learning_rate": 1.0043156996401965e-05, "loss": 0.1987, "num_input_tokens_seen": 5382184, "step": 28235 }, { "epoch": 14.677754677754677, "grad_norm": 0.7915512919425964, "learning_rate": 1.0034072529319516e-05, "loss": 0.1192, "num_input_tokens_seen": 5383144, "step": 28240 }, { "epoch": 14.68035343035343, "grad_norm": 0.35813966393470764, "learning_rate": 1.0024991141084986e-05, "loss": 0.217, "num_input_tokens_seen": 5384072, "step": 28245 }, { "epoch": 14.682952182952183, "grad_norm": 0.10784602165222168, "learning_rate": 1.0015912833566624e-05, "loss": 0.0869, "num_input_tokens_seen": 5385032, "step": 28250 }, { "epoch": 14.685550935550935, "grad_norm": 0.35113856196403503, "learning_rate": 1.0006837608632072e-05, "loss": 0.1414, "num_input_tokens_seen": 5385992, "step": 28255 }, { "epoch": 14.688149688149688, "grad_norm": 0.8435274958610535, "learning_rate": 9.99776546814831e-06, "loss": 0.1449, "num_input_tokens_seen": 5386984, "step": 28260 }, { "epoch": 14.690748440748441, "grad_norm": 1.2408567667007446, "learning_rate": 9.988696413981694e-06, "loss": 0.0996, "num_input_tokens_seen": 5387912, "step": 28265 }, { "epoch": 14.693347193347194, "grad_norm": 0.9532626867294312, "learning_rate": 9.979630447997942e-06, "loss": 0.1585, "num_input_tokens_seen": 5388872, "step": 28270 }, { "epoch": 14.695945945945946, "grad_norm": 0.09320659190416336, "learning_rate": 9.970567572062145e-06, "loss": 0.022, "num_input_tokens_seen": 5389768, "step": 28275 }, { "epoch": 14.698544698544698, "grad_norm": 0.6216936111450195, "learning_rate": 9.96150778803875e-06, "loss": 0.1178, "num_input_tokens_seen": 5390728, "step": 28280 }, { "epoch": 14.701143451143452, "grad_norm": 0.7542116045951843, "learning_rate": 9.952451097791557e-06, "loss": 0.2048, "num_input_tokens_seen": 5391624, "step": 28285 }, { "epoch": 14.703742203742204, "grad_norm": 2.2857918739318848, "learning_rate": 9.943397503183768e-06, "loss": 0.1312, "num_input_tokens_seen": 5392552, "step": 28290 }, { "epoch": 14.706340956340956, "grad_norm": 0.2666369676589966, "learning_rate": 9.93434700607791e-06, "loss": 0.1016, "num_input_tokens_seen": 5393544, "step": 28295 }, { "epoch": 14.70893970893971, "grad_norm": 0.49618175625801086, "learning_rate": 9.925299608335878e-06, "loss": 0.1321, "num_input_tokens_seen": 5394472, "step": 28300 }, { "epoch": 14.711538461538462, "grad_norm": 0.2659633159637451, "learning_rate": 9.916255311818956e-06, "loss": 0.1716, "num_input_tokens_seen": 5395368, "step": 28305 }, { "epoch": 14.714137214137214, "grad_norm": 0.9848700761795044, "learning_rate": 9.907214118387761e-06, "loss": 0.063, "num_input_tokens_seen": 5396328, "step": 28310 }, { "epoch": 14.716735966735968, "grad_norm": 0.31669461727142334, "learning_rate": 9.898176029902281e-06, "loss": 0.1395, "num_input_tokens_seen": 5397384, "step": 28315 }, { "epoch": 14.71933471933472, "grad_norm": 0.07408683747053146, "learning_rate": 9.889141048221862e-06, "loss": 0.0342, "num_input_tokens_seen": 5398280, "step": 28320 }, { "epoch": 14.721933471933472, "grad_norm": 0.9800271391868591, "learning_rate": 9.880109175205226e-06, "loss": 0.1712, "num_input_tokens_seen": 5399272, "step": 28325 }, { "epoch": 14.724532224532224, "grad_norm": 0.9865177869796753, "learning_rate": 9.87108041271045e-06, "loss": 0.1033, "num_input_tokens_seen": 5400232, "step": 28330 }, { "epoch": 14.727130977130978, "grad_norm": 0.097950279712677, "learning_rate": 9.862054762594936e-06, "loss": 0.094, "num_input_tokens_seen": 5401128, "step": 28335 }, { "epoch": 14.72972972972973, "grad_norm": 0.40611472725868225, "learning_rate": 9.853032226715505e-06, "loss": 0.042, "num_input_tokens_seen": 5402024, "step": 28340 }, { "epoch": 14.732328482328482, "grad_norm": 1.0754833221435547, "learning_rate": 9.844012806928296e-06, "loss": 0.1282, "num_input_tokens_seen": 5403016, "step": 28345 }, { "epoch": 14.734927234927234, "grad_norm": 0.5089306831359863, "learning_rate": 9.834996505088817e-06, "loss": 0.1542, "num_input_tokens_seen": 5404104, "step": 28350 }, { "epoch": 14.737525987525988, "grad_norm": 0.11892211437225342, "learning_rate": 9.825983323051933e-06, "loss": 0.0536, "num_input_tokens_seen": 5405032, "step": 28355 }, { "epoch": 14.74012474012474, "grad_norm": 0.14429891109466553, "learning_rate": 9.816973262671883e-06, "loss": 0.1405, "num_input_tokens_seen": 5405992, "step": 28360 }, { "epoch": 14.742723492723492, "grad_norm": 1.3609801530838013, "learning_rate": 9.807966325802243e-06, "loss": 0.1259, "num_input_tokens_seen": 5406952, "step": 28365 }, { "epoch": 14.745322245322246, "grad_norm": 0.5102342963218689, "learning_rate": 9.798962514295956e-06, "loss": 0.0606, "num_input_tokens_seen": 5407944, "step": 28370 }, { "epoch": 14.747920997920998, "grad_norm": 0.6540773510932922, "learning_rate": 9.789961830005313e-06, "loss": 0.1129, "num_input_tokens_seen": 5408872, "step": 28375 }, { "epoch": 14.75051975051975, "grad_norm": 0.7474220991134644, "learning_rate": 9.780964274781984e-06, "loss": 0.1698, "num_input_tokens_seen": 5409832, "step": 28380 }, { "epoch": 14.753118503118504, "grad_norm": 0.9111139178276062, "learning_rate": 9.771969850476972e-06, "loss": 0.1087, "num_input_tokens_seen": 5410696, "step": 28385 }, { "epoch": 14.755717255717256, "grad_norm": 0.4738064110279083, "learning_rate": 9.762978558940648e-06, "loss": 0.0997, "num_input_tokens_seen": 5411624, "step": 28390 }, { "epoch": 14.758316008316008, "grad_norm": 0.303739994764328, "learning_rate": 9.753990402022727e-06, "loss": 0.1045, "num_input_tokens_seen": 5412584, "step": 28395 }, { "epoch": 14.76091476091476, "grad_norm": 1.09861421585083, "learning_rate": 9.745005381572298e-06, "loss": 0.1038, "num_input_tokens_seen": 5413512, "step": 28400 }, { "epoch": 14.763513513513514, "grad_norm": 1.3734475374221802, "learning_rate": 9.736023499437786e-06, "loss": 0.213, "num_input_tokens_seen": 5414408, "step": 28405 }, { "epoch": 14.766112266112266, "grad_norm": 0.20790857076644897, "learning_rate": 9.727044757466969e-06, "loss": 0.0767, "num_input_tokens_seen": 5415400, "step": 28410 }, { "epoch": 14.768711018711018, "grad_norm": 0.09145370870828629, "learning_rate": 9.71806915750701e-06, "loss": 0.1258, "num_input_tokens_seen": 5416296, "step": 28415 }, { "epoch": 14.771309771309772, "grad_norm": 0.5238674879074097, "learning_rate": 9.709096701404394e-06, "loss": 0.0751, "num_input_tokens_seen": 5417192, "step": 28420 }, { "epoch": 14.773908523908524, "grad_norm": 0.6653363108634949, "learning_rate": 9.70012739100496e-06, "loss": 0.0961, "num_input_tokens_seen": 5418120, "step": 28425 }, { "epoch": 14.776507276507276, "grad_norm": 1.0539971590042114, "learning_rate": 9.691161228153925e-06, "loss": 0.1567, "num_input_tokens_seen": 5419144, "step": 28430 }, { "epoch": 14.779106029106028, "grad_norm": 0.9435888528823853, "learning_rate": 9.682198214695833e-06, "loss": 0.1124, "num_input_tokens_seen": 5420104, "step": 28435 }, { "epoch": 14.781704781704782, "grad_norm": 0.3570348620414734, "learning_rate": 9.673238352474592e-06, "loss": 0.087, "num_input_tokens_seen": 5421032, "step": 28440 }, { "epoch": 14.784303534303534, "grad_norm": 0.2553694248199463, "learning_rate": 9.664281643333451e-06, "loss": 0.0894, "num_input_tokens_seen": 5421992, "step": 28445 }, { "epoch": 14.786902286902286, "grad_norm": 0.42371803522109985, "learning_rate": 9.655328089115034e-06, "loss": 0.1119, "num_input_tokens_seen": 5423016, "step": 28450 }, { "epoch": 14.78950103950104, "grad_norm": 0.7191260457038879, "learning_rate": 9.64637769166129e-06, "loss": 0.1153, "num_input_tokens_seen": 5423944, "step": 28455 }, { "epoch": 14.792099792099792, "grad_norm": 0.1618102490901947, "learning_rate": 9.637430452813535e-06, "loss": 0.0673, "num_input_tokens_seen": 5424936, "step": 28460 }, { "epoch": 14.794698544698544, "grad_norm": 0.1513570100069046, "learning_rate": 9.628486374412421e-06, "loss": 0.1186, "num_input_tokens_seen": 5425864, "step": 28465 }, { "epoch": 14.797297297297296, "grad_norm": 0.9883790612220764, "learning_rate": 9.619545458297965e-06, "loss": 0.0952, "num_input_tokens_seen": 5426760, "step": 28470 }, { "epoch": 14.79989604989605, "grad_norm": 0.04943476989865303, "learning_rate": 9.610607706309524e-06, "loss": 0.1111, "num_input_tokens_seen": 5427752, "step": 28475 }, { "epoch": 14.802494802494802, "grad_norm": 1.1327717304229736, "learning_rate": 9.601673120285798e-06, "loss": 0.1134, "num_input_tokens_seen": 5428712, "step": 28480 }, { "epoch": 14.805093555093555, "grad_norm": 0.24476690590381622, "learning_rate": 9.592741702064859e-06, "loss": 0.0255, "num_input_tokens_seen": 5429640, "step": 28485 }, { "epoch": 14.807692307692308, "grad_norm": 0.2880100607872009, "learning_rate": 9.583813453484108e-06, "loss": 0.1152, "num_input_tokens_seen": 5430568, "step": 28490 }, { "epoch": 14.81029106029106, "grad_norm": 0.19522182643413544, "learning_rate": 9.574888376380297e-06, "loss": 0.0718, "num_input_tokens_seen": 5431528, "step": 28495 }, { "epoch": 14.812889812889813, "grad_norm": 0.33632567524909973, "learning_rate": 9.565966472589519e-06, "loss": 0.1217, "num_input_tokens_seen": 5432488, "step": 28500 }, { "epoch": 14.815488565488565, "grad_norm": 0.0460270531475544, "learning_rate": 9.557047743947239e-06, "loss": 0.1768, "num_input_tokens_seen": 5433416, "step": 28505 }, { "epoch": 14.818087318087318, "grad_norm": 0.033102720975875854, "learning_rate": 9.548132192288243e-06, "loss": 0.0569, "num_input_tokens_seen": 5434376, "step": 28510 }, { "epoch": 14.82068607068607, "grad_norm": 0.47811344265937805, "learning_rate": 9.53921981944667e-06, "loss": 0.1612, "num_input_tokens_seen": 5435368, "step": 28515 }, { "epoch": 14.823284823284823, "grad_norm": 0.130048930644989, "learning_rate": 9.530310627256005e-06, "loss": 0.1251, "num_input_tokens_seen": 5436360, "step": 28520 }, { "epoch": 14.825883575883577, "grad_norm": 0.22780479490756989, "learning_rate": 9.521404617549102e-06, "loss": 0.1162, "num_input_tokens_seen": 5437384, "step": 28525 }, { "epoch": 14.828482328482329, "grad_norm": 1.2095500230789185, "learning_rate": 9.512501792158116e-06, "loss": 0.1821, "num_input_tokens_seen": 5438376, "step": 28530 }, { "epoch": 14.83108108108108, "grad_norm": 0.8467844128608704, "learning_rate": 9.503602152914567e-06, "loss": 0.0441, "num_input_tokens_seen": 5439368, "step": 28535 }, { "epoch": 14.833679833679835, "grad_norm": 1.0016945600509644, "learning_rate": 9.494705701649345e-06, "loss": 0.0927, "num_input_tokens_seen": 5440264, "step": 28540 }, { "epoch": 14.836278586278587, "grad_norm": 1.183125376701355, "learning_rate": 9.48581244019265e-06, "loss": 0.3591, "num_input_tokens_seen": 5441192, "step": 28545 }, { "epoch": 14.838877338877339, "grad_norm": 1.2425150871276855, "learning_rate": 9.476922370374027e-06, "loss": 0.2455, "num_input_tokens_seen": 5442088, "step": 28550 }, { "epoch": 14.84147609147609, "grad_norm": 0.29177308082580566, "learning_rate": 9.4680354940224e-06, "loss": 0.0911, "num_input_tokens_seen": 5443048, "step": 28555 }, { "epoch": 14.844074844074845, "grad_norm": 1.5563281774520874, "learning_rate": 9.459151812965997e-06, "loss": 0.1222, "num_input_tokens_seen": 5443976, "step": 28560 }, { "epoch": 14.846673596673597, "grad_norm": 0.4049864709377289, "learning_rate": 9.450271329032404e-06, "loss": 0.1731, "num_input_tokens_seen": 5444904, "step": 28565 }, { "epoch": 14.849272349272349, "grad_norm": 0.2948578894138336, "learning_rate": 9.441394044048538e-06, "loss": 0.0392, "num_input_tokens_seen": 5445832, "step": 28570 }, { "epoch": 14.851871101871101, "grad_norm": 0.3240148425102234, "learning_rate": 9.432519959840689e-06, "loss": 0.0455, "num_input_tokens_seen": 5446792, "step": 28575 }, { "epoch": 14.854469854469855, "grad_norm": 1.0032533407211304, "learning_rate": 9.423649078234453e-06, "loss": 0.1528, "num_input_tokens_seen": 5447720, "step": 28580 }, { "epoch": 14.857068607068607, "grad_norm": 0.8505225777626038, "learning_rate": 9.41478140105479e-06, "loss": 0.1918, "num_input_tokens_seen": 5448744, "step": 28585 }, { "epoch": 14.859667359667359, "grad_norm": 0.6091525554656982, "learning_rate": 9.405916930125985e-06, "loss": 0.1423, "num_input_tokens_seen": 5449608, "step": 28590 }, { "epoch": 14.862266112266113, "grad_norm": 0.07331571727991104, "learning_rate": 9.397055667271675e-06, "loss": 0.0702, "num_input_tokens_seen": 5450536, "step": 28595 }, { "epoch": 14.864864864864865, "grad_norm": 0.20589697360992432, "learning_rate": 9.388197614314834e-06, "loss": 0.0918, "num_input_tokens_seen": 5451528, "step": 28600 }, { "epoch": 14.867463617463617, "grad_norm": 1.1180570125579834, "learning_rate": 9.379342773077759e-06, "loss": 0.2549, "num_input_tokens_seen": 5452456, "step": 28605 }, { "epoch": 14.87006237006237, "grad_norm": 0.9790568351745605, "learning_rate": 9.37049114538213e-06, "loss": 0.1557, "num_input_tokens_seen": 5453384, "step": 28610 }, { "epoch": 14.872661122661123, "grad_norm": 0.07802839577198029, "learning_rate": 9.36164273304892e-06, "loss": 0.1329, "num_input_tokens_seen": 5454376, "step": 28615 }, { "epoch": 14.875259875259875, "grad_norm": 1.5445129871368408, "learning_rate": 9.352797537898461e-06, "loss": 0.1622, "num_input_tokens_seen": 5455304, "step": 28620 }, { "epoch": 14.877858627858627, "grad_norm": 0.5261105298995972, "learning_rate": 9.343955561750412e-06, "loss": 0.0775, "num_input_tokens_seen": 5456296, "step": 28625 }, { "epoch": 14.880457380457381, "grad_norm": 1.4153673648834229, "learning_rate": 9.335116806423796e-06, "loss": 0.1355, "num_input_tokens_seen": 5457224, "step": 28630 }, { "epoch": 14.883056133056133, "grad_norm": 1.3634065389633179, "learning_rate": 9.326281273736948e-06, "loss": 0.2969, "num_input_tokens_seen": 5458152, "step": 28635 }, { "epoch": 14.885654885654885, "grad_norm": 0.6405664086341858, "learning_rate": 9.317448965507547e-06, "loss": 0.0916, "num_input_tokens_seen": 5459080, "step": 28640 }, { "epoch": 14.888253638253639, "grad_norm": 0.7379133105278015, "learning_rate": 9.308619883552596e-06, "loss": 0.0793, "num_input_tokens_seen": 5460168, "step": 28645 }, { "epoch": 14.890852390852391, "grad_norm": 0.9393922090530396, "learning_rate": 9.299794029688471e-06, "loss": 0.1425, "num_input_tokens_seen": 5461064, "step": 28650 }, { "epoch": 14.893451143451143, "grad_norm": 1.0799890756607056, "learning_rate": 9.290971405730859e-06, "loss": 0.1251, "num_input_tokens_seen": 5462024, "step": 28655 }, { "epoch": 14.896049896049895, "grad_norm": 0.9202175736427307, "learning_rate": 9.282152013494755e-06, "loss": 0.342, "num_input_tokens_seen": 5463048, "step": 28660 }, { "epoch": 14.89864864864865, "grad_norm": 0.15243473649024963, "learning_rate": 9.273335854794548e-06, "loss": 0.1795, "num_input_tokens_seen": 5464008, "step": 28665 }, { "epoch": 14.901247401247401, "grad_norm": 0.1387922614812851, "learning_rate": 9.264522931443922e-06, "loss": 0.1635, "num_input_tokens_seen": 5464936, "step": 28670 }, { "epoch": 14.903846153846153, "grad_norm": 0.9260664582252502, "learning_rate": 9.255713245255895e-06, "loss": 0.1238, "num_input_tokens_seen": 5465768, "step": 28675 }, { "epoch": 14.906444906444907, "grad_norm": 1.0703625679016113, "learning_rate": 9.246906798042848e-06, "loss": 0.1125, "num_input_tokens_seen": 5466760, "step": 28680 }, { "epoch": 14.90904365904366, "grad_norm": 0.9760655760765076, "learning_rate": 9.238103591616471e-06, "loss": 0.1131, "num_input_tokens_seen": 5467752, "step": 28685 }, { "epoch": 14.911642411642411, "grad_norm": 0.2699984908103943, "learning_rate": 9.22930362778779e-06, "loss": 0.1652, "num_input_tokens_seen": 5468648, "step": 28690 }, { "epoch": 14.914241164241163, "grad_norm": 0.6010375022888184, "learning_rate": 9.220506908367154e-06, "loss": 0.12, "num_input_tokens_seen": 5469608, "step": 28695 }, { "epoch": 14.916839916839917, "grad_norm": 0.5217941403388977, "learning_rate": 9.211713435164287e-06, "loss": 0.0894, "num_input_tokens_seen": 5470632, "step": 28700 }, { "epoch": 14.91943866943867, "grad_norm": 1.476530909538269, "learning_rate": 9.202923209988198e-06, "loss": 0.1618, "num_input_tokens_seen": 5471560, "step": 28705 }, { "epoch": 14.922037422037421, "grad_norm": 0.23461033403873444, "learning_rate": 9.19413623464725e-06, "loss": 0.0756, "num_input_tokens_seen": 5472584, "step": 28710 }, { "epoch": 14.924636174636175, "grad_norm": 0.5212222337722778, "learning_rate": 9.185352510949119e-06, "loss": 0.1256, "num_input_tokens_seen": 5473576, "step": 28715 }, { "epoch": 14.927234927234927, "grad_norm": 1.3718807697296143, "learning_rate": 9.17657204070086e-06, "loss": 0.2172, "num_input_tokens_seen": 5474536, "step": 28720 }, { "epoch": 14.92983367983368, "grad_norm": 0.2816154956817627, "learning_rate": 9.167794825708795e-06, "loss": 0.0511, "num_input_tokens_seen": 5475496, "step": 28725 }, { "epoch": 14.932432432432432, "grad_norm": 2.389913320541382, "learning_rate": 9.159020867778607e-06, "loss": 0.1676, "num_input_tokens_seen": 5476456, "step": 28730 }, { "epoch": 14.935031185031185, "grad_norm": 0.5071040987968445, "learning_rate": 9.150250168715324e-06, "loss": 0.118, "num_input_tokens_seen": 5477416, "step": 28735 }, { "epoch": 14.937629937629938, "grad_norm": 1.0257142782211304, "learning_rate": 9.14148273032328e-06, "loss": 0.1407, "num_input_tokens_seen": 5478408, "step": 28740 }, { "epoch": 14.94022869022869, "grad_norm": 0.8808287382125854, "learning_rate": 9.132718554406146e-06, "loss": 0.0952, "num_input_tokens_seen": 5479400, "step": 28745 }, { "epoch": 14.942827442827443, "grad_norm": 0.7748234868049622, "learning_rate": 9.123957642766917e-06, "loss": 0.1224, "num_input_tokens_seen": 5480328, "step": 28750 }, { "epoch": 14.945426195426196, "grad_norm": 0.31771790981292725, "learning_rate": 9.115199997207932e-06, "loss": 0.1371, "num_input_tokens_seen": 5481192, "step": 28755 }, { "epoch": 14.948024948024948, "grad_norm": 0.40874573588371277, "learning_rate": 9.106445619530842e-06, "loss": 0.1168, "num_input_tokens_seen": 5482120, "step": 28760 }, { "epoch": 14.950623700623701, "grad_norm": 0.9069126844406128, "learning_rate": 9.097694511536634e-06, "loss": 0.1136, "num_input_tokens_seen": 5483080, "step": 28765 }, { "epoch": 14.953222453222454, "grad_norm": 0.5805141925811768, "learning_rate": 9.088946675025605e-06, "loss": 0.0884, "num_input_tokens_seen": 5484008, "step": 28770 }, { "epoch": 14.955821205821206, "grad_norm": 1.1372758150100708, "learning_rate": 9.080202111797414e-06, "loss": 0.1137, "num_input_tokens_seen": 5485000, "step": 28775 }, { "epoch": 14.958419958419958, "grad_norm": 0.9891412854194641, "learning_rate": 9.071460823651018e-06, "loss": 0.1237, "num_input_tokens_seen": 5485992, "step": 28780 }, { "epoch": 14.961018711018712, "grad_norm": 0.19026592373847961, "learning_rate": 9.062722812384704e-06, "loss": 0.0589, "num_input_tokens_seen": 5486888, "step": 28785 }, { "epoch": 14.963617463617464, "grad_norm": 0.8076252341270447, "learning_rate": 9.053988079796096e-06, "loss": 0.152, "num_input_tokens_seen": 5487912, "step": 28790 }, { "epoch": 14.966216216216216, "grad_norm": 0.8920058608055115, "learning_rate": 9.045256627682133e-06, "loss": 0.1106, "num_input_tokens_seen": 5488808, "step": 28795 }, { "epoch": 14.96881496881497, "grad_norm": 0.5888583064079285, "learning_rate": 9.036528457839073e-06, "loss": 0.1265, "num_input_tokens_seen": 5489800, "step": 28800 }, { "epoch": 14.971413721413722, "grad_norm": 0.32218900322914124, "learning_rate": 9.027803572062527e-06, "loss": 0.128, "num_input_tokens_seen": 5490696, "step": 28805 }, { "epoch": 14.974012474012474, "grad_norm": 0.42726969718933105, "learning_rate": 9.0190819721474e-06, "loss": 0.155, "num_input_tokens_seen": 5491816, "step": 28810 }, { "epoch": 14.976611226611226, "grad_norm": 0.14838793873786926, "learning_rate": 9.010363659887941e-06, "loss": 0.1325, "num_input_tokens_seen": 5492840, "step": 28815 }, { "epoch": 14.97920997920998, "grad_norm": 0.23340217769145966, "learning_rate": 9.001648637077698e-06, "loss": 0.1388, "num_input_tokens_seen": 5493800, "step": 28820 }, { "epoch": 14.981808731808732, "grad_norm": 0.2081766277551651, "learning_rate": 8.99293690550958e-06, "loss": 0.0721, "num_input_tokens_seen": 5494728, "step": 28825 }, { "epoch": 14.984407484407484, "grad_norm": 1.008272647857666, "learning_rate": 8.984228466975789e-06, "loss": 0.2642, "num_input_tokens_seen": 5495688, "step": 28830 }, { "epoch": 14.987006237006238, "grad_norm": 0.5570778846740723, "learning_rate": 8.975523323267856e-06, "loss": 0.1522, "num_input_tokens_seen": 5496648, "step": 28835 }, { "epoch": 14.98960498960499, "grad_norm": 1.9340590238571167, "learning_rate": 8.966821476176632e-06, "loss": 0.2208, "num_input_tokens_seen": 5497576, "step": 28840 }, { "epoch": 14.992203742203742, "grad_norm": 1.3312320709228516, "learning_rate": 8.958122927492307e-06, "loss": 0.2258, "num_input_tokens_seen": 5498600, "step": 28845 }, { "epoch": 14.994802494802494, "grad_norm": 0.17569324374198914, "learning_rate": 8.949427679004385e-06, "loss": 0.0193, "num_input_tokens_seen": 5499496, "step": 28850 }, { "epoch": 14.997401247401248, "grad_norm": 2.306987762451172, "learning_rate": 8.940735732501655e-06, "loss": 0.2505, "num_input_tokens_seen": 5500520, "step": 28855 }, { "epoch": 15.0, "grad_norm": 1.7640697956085205, "learning_rate": 8.932047089772283e-06, "loss": 0.141, "num_input_tokens_seen": 5501408, "step": 28860 }, { "epoch": 15.0, "eval_loss": 0.14678211510181427, "eval_runtime": 8.0302, "eval_samples_per_second": 106.597, "eval_steps_per_second": 26.649, "num_input_tokens_seen": 5501408, "step": 28860 }, { "epoch": 15.002598752598752, "grad_norm": 0.7408189177513123, "learning_rate": 8.923361752603725e-06, "loss": 0.1358, "num_input_tokens_seen": 5502368, "step": 28865 }, { "epoch": 15.005197505197506, "grad_norm": 0.2837170660495758, "learning_rate": 8.914679722782762e-06, "loss": 0.2281, "num_input_tokens_seen": 5503392, "step": 28870 }, { "epoch": 15.007796257796258, "grad_norm": 0.7518543601036072, "learning_rate": 8.906001002095482e-06, "loss": 0.1596, "num_input_tokens_seen": 5504352, "step": 28875 }, { "epoch": 15.01039501039501, "grad_norm": 0.8952116370201111, "learning_rate": 8.897325592327327e-06, "loss": 0.0531, "num_input_tokens_seen": 5505248, "step": 28880 }, { "epoch": 15.012993762993762, "grad_norm": 0.8013349771499634, "learning_rate": 8.888653495263019e-06, "loss": 0.0617, "num_input_tokens_seen": 5506144, "step": 28885 }, { "epoch": 15.015592515592516, "grad_norm": 1.329559564590454, "learning_rate": 8.879984712686623e-06, "loss": 0.1991, "num_input_tokens_seen": 5507104, "step": 28890 }, { "epoch": 15.018191268191268, "grad_norm": 0.8830448389053345, "learning_rate": 8.871319246381504e-06, "loss": 0.1671, "num_input_tokens_seen": 5508000, "step": 28895 }, { "epoch": 15.02079002079002, "grad_norm": 0.8076056241989136, "learning_rate": 8.862657098130368e-06, "loss": 0.1298, "num_input_tokens_seen": 5508928, "step": 28900 }, { "epoch": 15.023388773388774, "grad_norm": 0.481556236743927, "learning_rate": 8.85399826971522e-06, "loss": 0.1078, "num_input_tokens_seen": 5509952, "step": 28905 }, { "epoch": 15.025987525987526, "grad_norm": 0.2110888808965683, "learning_rate": 8.845342762917378e-06, "loss": 0.1345, "num_input_tokens_seen": 5510944, "step": 28910 }, { "epoch": 15.028586278586278, "grad_norm": 0.9173693060874939, "learning_rate": 8.836690579517512e-06, "loss": 0.192, "num_input_tokens_seen": 5511968, "step": 28915 }, { "epoch": 15.03118503118503, "grad_norm": 0.23964539170265198, "learning_rate": 8.828041721295558e-06, "loss": 0.0752, "num_input_tokens_seen": 5512960, "step": 28920 }, { "epoch": 15.033783783783784, "grad_norm": 0.2991921901702881, "learning_rate": 8.819396190030788e-06, "loss": 0.0379, "num_input_tokens_seen": 5513920, "step": 28925 }, { "epoch": 15.036382536382536, "grad_norm": 0.1831207424402237, "learning_rate": 8.810753987501813e-06, "loss": 0.1659, "num_input_tokens_seen": 5514848, "step": 28930 }, { "epoch": 15.038981288981288, "grad_norm": 1.1469157934188843, "learning_rate": 8.802115115486535e-06, "loss": 0.1259, "num_input_tokens_seen": 5515840, "step": 28935 }, { "epoch": 15.041580041580042, "grad_norm": 1.147318959236145, "learning_rate": 8.79347957576217e-06, "loss": 0.1493, "num_input_tokens_seen": 5516864, "step": 28940 }, { "epoch": 15.044178794178794, "grad_norm": 0.1437487155199051, "learning_rate": 8.78484737010525e-06, "loss": 0.1897, "num_input_tokens_seen": 5517888, "step": 28945 }, { "epoch": 15.046777546777546, "grad_norm": 1.1981396675109863, "learning_rate": 8.776218500291642e-06, "loss": 0.083, "num_input_tokens_seen": 5518848, "step": 28950 }, { "epoch": 15.049376299376299, "grad_norm": 0.1209501102566719, "learning_rate": 8.7675929680965e-06, "loss": 0.1223, "num_input_tokens_seen": 5519808, "step": 28955 }, { "epoch": 15.051975051975052, "grad_norm": 1.1321603059768677, "learning_rate": 8.758970775294301e-06, "loss": 0.0669, "num_input_tokens_seen": 5520832, "step": 28960 }, { "epoch": 15.054573804573804, "grad_norm": 0.08526073396205902, "learning_rate": 8.750351923658832e-06, "loss": 0.1516, "num_input_tokens_seen": 5521728, "step": 28965 }, { "epoch": 15.057172557172557, "grad_norm": 0.3361267149448395, "learning_rate": 8.741736414963206e-06, "loss": 0.1863, "num_input_tokens_seen": 5522752, "step": 28970 }, { "epoch": 15.05977130977131, "grad_norm": 1.5677992105484009, "learning_rate": 8.733124250979832e-06, "loss": 0.0827, "num_input_tokens_seen": 5523648, "step": 28975 }, { "epoch": 15.062370062370062, "grad_norm": 0.48314332962036133, "learning_rate": 8.724515433480443e-06, "loss": 0.0903, "num_input_tokens_seen": 5524640, "step": 28980 }, { "epoch": 15.064968814968815, "grad_norm": 0.14673180878162384, "learning_rate": 8.715909964236071e-06, "loss": 0.0968, "num_input_tokens_seen": 5525600, "step": 28985 }, { "epoch": 15.067567567567568, "grad_norm": 1.184301495552063, "learning_rate": 8.707307845017068e-06, "loss": 0.1044, "num_input_tokens_seen": 5526528, "step": 28990 }, { "epoch": 15.07016632016632, "grad_norm": 0.61293625831604, "learning_rate": 8.698709077593098e-06, "loss": 0.2009, "num_input_tokens_seen": 5527456, "step": 28995 }, { "epoch": 15.072765072765073, "grad_norm": 0.7282804250717163, "learning_rate": 8.69011366373312e-06, "loss": 0.167, "num_input_tokens_seen": 5528448, "step": 29000 }, { "epoch": 15.075363825363825, "grad_norm": 0.354067325592041, "learning_rate": 8.681521605205434e-06, "loss": 0.0787, "num_input_tokens_seen": 5529344, "step": 29005 }, { "epoch": 15.077962577962579, "grad_norm": 0.6231356859207153, "learning_rate": 8.672932903777625e-06, "loss": 0.2303, "num_input_tokens_seen": 5530272, "step": 29010 }, { "epoch": 15.08056133056133, "grad_norm": 0.6584720015525818, "learning_rate": 8.664347561216588e-06, "loss": 0.1598, "num_input_tokens_seen": 5531232, "step": 29015 }, { "epoch": 15.083160083160083, "grad_norm": 0.28014010190963745, "learning_rate": 8.65576557928853e-06, "loss": 0.1391, "num_input_tokens_seen": 5532192, "step": 29020 }, { "epoch": 15.085758835758837, "grad_norm": 0.22487583756446838, "learning_rate": 8.647186959758983e-06, "loss": 0.0627, "num_input_tokens_seen": 5533152, "step": 29025 }, { "epoch": 15.088357588357589, "grad_norm": 0.5823853611946106, "learning_rate": 8.638611704392762e-06, "loss": 0.0517, "num_input_tokens_seen": 5534144, "step": 29030 }, { "epoch": 15.09095634095634, "grad_norm": 0.6378002166748047, "learning_rate": 8.630039814953999e-06, "loss": 0.0957, "num_input_tokens_seen": 5535072, "step": 29035 }, { "epoch": 15.093555093555093, "grad_norm": 0.2915542423725128, "learning_rate": 8.621471293206148e-06, "loss": 0.0621, "num_input_tokens_seen": 5535968, "step": 29040 }, { "epoch": 15.096153846153847, "grad_norm": 0.8086419701576233, "learning_rate": 8.612906140911964e-06, "loss": 0.2144, "num_input_tokens_seen": 5536928, "step": 29045 }, { "epoch": 15.098752598752599, "grad_norm": 0.38488486409187317, "learning_rate": 8.604344359833472e-06, "loss": 0.0968, "num_input_tokens_seen": 5537920, "step": 29050 }, { "epoch": 15.10135135135135, "grad_norm": 1.5727421045303345, "learning_rate": 8.595785951732063e-06, "loss": 0.189, "num_input_tokens_seen": 5538880, "step": 29055 }, { "epoch": 15.103950103950105, "grad_norm": 0.6972281336784363, "learning_rate": 8.587230918368394e-06, "loss": 0.1372, "num_input_tokens_seen": 5539840, "step": 29060 }, { "epoch": 15.106548856548857, "grad_norm": 1.2373340129852295, "learning_rate": 8.57867926150244e-06, "loss": 0.0894, "num_input_tokens_seen": 5540832, "step": 29065 }, { "epoch": 15.109147609147609, "grad_norm": 0.40369170904159546, "learning_rate": 8.570130982893476e-06, "loss": 0.1284, "num_input_tokens_seen": 5541760, "step": 29070 }, { "epoch": 15.111746361746361, "grad_norm": 0.3641625940799713, "learning_rate": 8.561586084300096e-06, "loss": 0.1398, "num_input_tokens_seen": 5542656, "step": 29075 }, { "epoch": 15.114345114345115, "grad_norm": 0.8897615075111389, "learning_rate": 8.55304456748019e-06, "loss": 0.0757, "num_input_tokens_seen": 5543616, "step": 29080 }, { "epoch": 15.116943866943867, "grad_norm": 0.5702231526374817, "learning_rate": 8.544506434190946e-06, "loss": 0.1433, "num_input_tokens_seen": 5544640, "step": 29085 }, { "epoch": 15.119542619542619, "grad_norm": 0.24912717938423157, "learning_rate": 8.535971686188851e-06, "loss": 0.2439, "num_input_tokens_seen": 5545600, "step": 29090 }, { "epoch": 15.122141372141373, "grad_norm": 0.614629864692688, "learning_rate": 8.52744032522973e-06, "loss": 0.0558, "num_input_tokens_seen": 5546464, "step": 29095 }, { "epoch": 15.124740124740125, "grad_norm": 0.582880973815918, "learning_rate": 8.518912353068672e-06, "loss": 0.1211, "num_input_tokens_seen": 5547360, "step": 29100 }, { "epoch": 15.127338877338877, "grad_norm": 0.9859534502029419, "learning_rate": 8.510387771460088e-06, "loss": 0.158, "num_input_tokens_seen": 5548352, "step": 29105 }, { "epoch": 15.12993762993763, "grad_norm": 0.9286714196205139, "learning_rate": 8.501866582157677e-06, "loss": 0.0656, "num_input_tokens_seen": 5549376, "step": 29110 }, { "epoch": 15.132536382536383, "grad_norm": 0.8371067047119141, "learning_rate": 8.493348786914476e-06, "loss": 0.1437, "num_input_tokens_seen": 5550304, "step": 29115 }, { "epoch": 15.135135135135135, "grad_norm": 0.36304253339767456, "learning_rate": 8.484834387482776e-06, "loss": 0.1248, "num_input_tokens_seen": 5551296, "step": 29120 }, { "epoch": 15.137733887733887, "grad_norm": 0.2039792835712433, "learning_rate": 8.47632338561419e-06, "loss": 0.0635, "num_input_tokens_seen": 5552192, "step": 29125 }, { "epoch": 15.140332640332641, "grad_norm": 0.45433202385902405, "learning_rate": 8.467815783059652e-06, "loss": 0.1597, "num_input_tokens_seen": 5553184, "step": 29130 }, { "epoch": 15.142931392931393, "grad_norm": 0.1470336616039276, "learning_rate": 8.459311581569365e-06, "loss": 0.0932, "num_input_tokens_seen": 5554240, "step": 29135 }, { "epoch": 15.145530145530145, "grad_norm": 1.4497579336166382, "learning_rate": 8.45081078289285e-06, "loss": 0.2656, "num_input_tokens_seen": 5555232, "step": 29140 }, { "epoch": 15.148128898128897, "grad_norm": 0.1467188447713852, "learning_rate": 8.442313388778918e-06, "loss": 0.0944, "num_input_tokens_seen": 5556256, "step": 29145 }, { "epoch": 15.150727650727651, "grad_norm": 1.230019450187683, "learning_rate": 8.433819400975701e-06, "loss": 0.2046, "num_input_tokens_seen": 5557216, "step": 29150 }, { "epoch": 15.153326403326403, "grad_norm": 0.6418245434761047, "learning_rate": 8.425328821230601e-06, "loss": 0.1177, "num_input_tokens_seen": 5558144, "step": 29155 }, { "epoch": 15.155925155925155, "grad_norm": 0.5117676258087158, "learning_rate": 8.41684165129033e-06, "loss": 0.1222, "num_input_tokens_seen": 5559136, "step": 29160 }, { "epoch": 15.15852390852391, "grad_norm": 0.3377019166946411, "learning_rate": 8.408357892900914e-06, "loss": 0.1235, "num_input_tokens_seen": 5560032, "step": 29165 }, { "epoch": 15.161122661122661, "grad_norm": 0.3295118808746338, "learning_rate": 8.39987754780766e-06, "loss": 0.0533, "num_input_tokens_seen": 5560960, "step": 29170 }, { "epoch": 15.163721413721413, "grad_norm": 0.24228650331497192, "learning_rate": 8.391400617755175e-06, "loss": 0.1895, "num_input_tokens_seen": 5561984, "step": 29175 }, { "epoch": 15.166320166320165, "grad_norm": 0.08316980302333832, "learning_rate": 8.382927104487365e-06, "loss": 0.0715, "num_input_tokens_seen": 5562880, "step": 29180 }, { "epoch": 15.16891891891892, "grad_norm": 0.6918843388557434, "learning_rate": 8.374457009747433e-06, "loss": 0.1212, "num_input_tokens_seen": 5563872, "step": 29185 }, { "epoch": 15.171517671517671, "grad_norm": 0.08772148936986923, "learning_rate": 8.365990335277879e-06, "loss": 0.0413, "num_input_tokens_seen": 5564864, "step": 29190 }, { "epoch": 15.174116424116423, "grad_norm": 1.1378569602966309, "learning_rate": 8.357527082820493e-06, "loss": 0.0936, "num_input_tokens_seen": 5565824, "step": 29195 }, { "epoch": 15.176715176715177, "grad_norm": 0.7915006875991821, "learning_rate": 8.349067254116385e-06, "loss": 0.2112, "num_input_tokens_seen": 5566784, "step": 29200 }, { "epoch": 15.17931392931393, "grad_norm": 1.1365382671356201, "learning_rate": 8.340610850905936e-06, "loss": 0.1853, "num_input_tokens_seen": 5567712, "step": 29205 }, { "epoch": 15.181912681912682, "grad_norm": 0.8651390075683594, "learning_rate": 8.332157874928826e-06, "loss": 0.0806, "num_input_tokens_seen": 5568608, "step": 29210 }, { "epoch": 15.184511434511435, "grad_norm": 0.4480120539665222, "learning_rate": 8.323708327924026e-06, "loss": 0.1829, "num_input_tokens_seen": 5569600, "step": 29215 }, { "epoch": 15.187110187110187, "grad_norm": 0.7561933398246765, "learning_rate": 8.31526221162983e-06, "loss": 0.2046, "num_input_tokens_seen": 5570528, "step": 29220 }, { "epoch": 15.18970893970894, "grad_norm": 0.11489249020814896, "learning_rate": 8.30681952778379e-06, "loss": 0.0511, "num_input_tokens_seen": 5571488, "step": 29225 }, { "epoch": 15.192307692307692, "grad_norm": 0.09789285808801651, "learning_rate": 8.298380278122772e-06, "loss": 0.0611, "num_input_tokens_seen": 5572416, "step": 29230 }, { "epoch": 15.194906444906445, "grad_norm": 0.9453873634338379, "learning_rate": 8.289944464382924e-06, "loss": 0.2376, "num_input_tokens_seen": 5573344, "step": 29235 }, { "epoch": 15.197505197505198, "grad_norm": 0.9730120897293091, "learning_rate": 8.281512088299707e-06, "loss": 0.2052, "num_input_tokens_seen": 5574336, "step": 29240 }, { "epoch": 15.20010395010395, "grad_norm": 0.5999734401702881, "learning_rate": 8.273083151607861e-06, "loss": 0.0468, "num_input_tokens_seen": 5575264, "step": 29245 }, { "epoch": 15.202702702702704, "grad_norm": 0.07689830660820007, "learning_rate": 8.264657656041397e-06, "loss": 0.0229, "num_input_tokens_seen": 5576128, "step": 29250 }, { "epoch": 15.205301455301456, "grad_norm": 0.8759334087371826, "learning_rate": 8.256235603333665e-06, "loss": 0.1044, "num_input_tokens_seen": 5577184, "step": 29255 }, { "epoch": 15.207900207900208, "grad_norm": 0.342507928609848, "learning_rate": 8.24781699521727e-06, "loss": 0.1335, "num_input_tokens_seen": 5578080, "step": 29260 }, { "epoch": 15.21049896049896, "grad_norm": 1.148513913154602, "learning_rate": 8.239401833424127e-06, "loss": 0.1289, "num_input_tokens_seen": 5579072, "step": 29265 }, { "epoch": 15.213097713097714, "grad_norm": 0.1567610800266266, "learning_rate": 8.230990119685419e-06, "loss": 0.1439, "num_input_tokens_seen": 5579968, "step": 29270 }, { "epoch": 15.215696465696466, "grad_norm": 0.9425705075263977, "learning_rate": 8.222581855731658e-06, "loss": 0.0894, "num_input_tokens_seen": 5580896, "step": 29275 }, { "epoch": 15.218295218295218, "grad_norm": 0.09888747334480286, "learning_rate": 8.214177043292617e-06, "loss": 0.0856, "num_input_tokens_seen": 5581824, "step": 29280 }, { "epoch": 15.220893970893972, "grad_norm": 1.1519039869308472, "learning_rate": 8.205775684097356e-06, "loss": 0.1648, "num_input_tokens_seen": 5582688, "step": 29285 }, { "epoch": 15.223492723492724, "grad_norm": 0.8742700815200806, "learning_rate": 8.197377779874249e-06, "loss": 0.1377, "num_input_tokens_seen": 5583680, "step": 29290 }, { "epoch": 15.226091476091476, "grad_norm": 0.8849685788154602, "learning_rate": 8.188983332350938e-06, "loss": 0.125, "num_input_tokens_seen": 5584704, "step": 29295 }, { "epoch": 15.228690228690228, "grad_norm": 0.657152533531189, "learning_rate": 8.180592343254361e-06, "loss": 0.1097, "num_input_tokens_seen": 5585632, "step": 29300 }, { "epoch": 15.231288981288982, "grad_norm": 0.6059194207191467, "learning_rate": 8.172204814310742e-06, "loss": 0.0868, "num_input_tokens_seen": 5586496, "step": 29305 }, { "epoch": 15.233887733887734, "grad_norm": 0.22177086770534515, "learning_rate": 8.16382074724561e-06, "loss": 0.1659, "num_input_tokens_seen": 5587488, "step": 29310 }, { "epoch": 15.236486486486486, "grad_norm": 0.915894091129303, "learning_rate": 8.155440143783749e-06, "loss": 0.159, "num_input_tokens_seen": 5588352, "step": 29315 }, { "epoch": 15.23908523908524, "grad_norm": 0.45808714628219604, "learning_rate": 8.147063005649246e-06, "loss": 0.1018, "num_input_tokens_seen": 5589344, "step": 29320 }, { "epoch": 15.241683991683992, "grad_norm": 0.2859543263912201, "learning_rate": 8.138689334565497e-06, "loss": 0.0618, "num_input_tokens_seen": 5590240, "step": 29325 }, { "epoch": 15.244282744282744, "grad_norm": 0.6717342138290405, "learning_rate": 8.130319132255156e-06, "loss": 0.1213, "num_input_tokens_seen": 5591200, "step": 29330 }, { "epoch": 15.246881496881496, "grad_norm": 0.8275039792060852, "learning_rate": 8.121952400440176e-06, "loss": 0.2592, "num_input_tokens_seen": 5592160, "step": 29335 }, { "epoch": 15.24948024948025, "grad_norm": 1.1339555978775024, "learning_rate": 8.113589140841782e-06, "loss": 0.1206, "num_input_tokens_seen": 5593056, "step": 29340 }, { "epoch": 15.252079002079002, "grad_norm": 0.9651682376861572, "learning_rate": 8.105229355180511e-06, "loss": 0.1727, "num_input_tokens_seen": 5593952, "step": 29345 }, { "epoch": 15.254677754677754, "grad_norm": 2.162602424621582, "learning_rate": 8.096873045176162e-06, "loss": 0.1368, "num_input_tokens_seen": 5594912, "step": 29350 }, { "epoch": 15.257276507276508, "grad_norm": 0.7984334826469421, "learning_rate": 8.088520212547831e-06, "loss": 0.108, "num_input_tokens_seen": 5595904, "step": 29355 }, { "epoch": 15.25987525987526, "grad_norm": 1.7778583765029907, "learning_rate": 8.080170859013883e-06, "loss": 0.1431, "num_input_tokens_seen": 5596864, "step": 29360 }, { "epoch": 15.262474012474012, "grad_norm": 0.10547000169754028, "learning_rate": 8.071824986291998e-06, "loss": 0.1035, "num_input_tokens_seen": 5597856, "step": 29365 }, { "epoch": 15.265072765072764, "grad_norm": 0.24251039326190948, "learning_rate": 8.063482596099109e-06, "loss": 0.117, "num_input_tokens_seen": 5598752, "step": 29370 }, { "epoch": 15.267671517671518, "grad_norm": 0.615735650062561, "learning_rate": 8.055143690151449e-06, "loss": 0.1758, "num_input_tokens_seen": 5599744, "step": 29375 }, { "epoch": 15.27027027027027, "grad_norm": 0.5386892557144165, "learning_rate": 8.046808270164527e-06, "loss": 0.2715, "num_input_tokens_seen": 5600704, "step": 29380 }, { "epoch": 15.272869022869022, "grad_norm": 0.8556243777275085, "learning_rate": 8.038476337853137e-06, "loss": 0.0945, "num_input_tokens_seen": 5601664, "step": 29385 }, { "epoch": 15.275467775467776, "grad_norm": 0.2824750542640686, "learning_rate": 8.030147894931356e-06, "loss": 0.0971, "num_input_tokens_seen": 5602624, "step": 29390 }, { "epoch": 15.278066528066528, "grad_norm": 0.08140597492456436, "learning_rate": 8.021822943112538e-06, "loss": 0.0907, "num_input_tokens_seen": 5603488, "step": 29395 }, { "epoch": 15.28066528066528, "grad_norm": 0.13503305613994598, "learning_rate": 8.013501484109336e-06, "loss": 0.0499, "num_input_tokens_seen": 5604448, "step": 29400 }, { "epoch": 15.283264033264032, "grad_norm": 0.30485230684280396, "learning_rate": 8.005183519633669e-06, "loss": 0.0593, "num_input_tokens_seen": 5605344, "step": 29405 }, { "epoch": 15.285862785862786, "grad_norm": 0.2459203451871872, "learning_rate": 7.996869051396727e-06, "loss": 0.0943, "num_input_tokens_seen": 5606240, "step": 29410 }, { "epoch": 15.288461538461538, "grad_norm": 0.5054944157600403, "learning_rate": 7.988558081109016e-06, "loss": 0.1135, "num_input_tokens_seen": 5607168, "step": 29415 }, { "epoch": 15.29106029106029, "grad_norm": 1.720525860786438, "learning_rate": 7.98025061048029e-06, "loss": 0.2445, "num_input_tokens_seen": 5608032, "step": 29420 }, { "epoch": 15.293659043659044, "grad_norm": 0.832234799861908, "learning_rate": 7.97194664121959e-06, "loss": 0.1997, "num_input_tokens_seen": 5608896, "step": 29425 }, { "epoch": 15.296257796257796, "grad_norm": 0.0997917503118515, "learning_rate": 7.963646175035239e-06, "loss": 0.2369, "num_input_tokens_seen": 5609760, "step": 29430 }, { "epoch": 15.298856548856548, "grad_norm": 0.5055224299430847, "learning_rate": 7.955349213634852e-06, "loss": 0.1641, "num_input_tokens_seen": 5610688, "step": 29435 }, { "epoch": 15.301455301455302, "grad_norm": 1.1794617176055908, "learning_rate": 7.947055758725316e-06, "loss": 0.2251, "num_input_tokens_seen": 5611680, "step": 29440 }, { "epoch": 15.304054054054054, "grad_norm": 0.638657808303833, "learning_rate": 7.938765812012763e-06, "loss": 0.1366, "num_input_tokens_seen": 5612672, "step": 29445 }, { "epoch": 15.306652806652806, "grad_norm": 0.08233864605426788, "learning_rate": 7.93047937520266e-06, "loss": 0.1104, "num_input_tokens_seen": 5613600, "step": 29450 }, { "epoch": 15.309251559251559, "grad_norm": 0.7054429650306702, "learning_rate": 7.922196449999717e-06, "loss": 0.1264, "num_input_tokens_seen": 5614560, "step": 29455 }, { "epoch": 15.311850311850312, "grad_norm": 1.0808721780776978, "learning_rate": 7.913917038107923e-06, "loss": 0.0662, "num_input_tokens_seen": 5615520, "step": 29460 }, { "epoch": 15.314449064449065, "grad_norm": 0.5366723537445068, "learning_rate": 7.905641141230552e-06, "loss": 0.1666, "num_input_tokens_seen": 5616480, "step": 29465 }, { "epoch": 15.317047817047817, "grad_norm": 0.5787447094917297, "learning_rate": 7.897368761070161e-06, "loss": 0.0308, "num_input_tokens_seen": 5617440, "step": 29470 }, { "epoch": 15.31964656964657, "grad_norm": 0.38330769538879395, "learning_rate": 7.889099899328576e-06, "loss": 0.0714, "num_input_tokens_seen": 5618400, "step": 29475 }, { "epoch": 15.322245322245323, "grad_norm": 0.230824813246727, "learning_rate": 7.88083455770689e-06, "loss": 0.0752, "num_input_tokens_seen": 5619360, "step": 29480 }, { "epoch": 15.324844074844075, "grad_norm": 0.5296244025230408, "learning_rate": 7.872572737905476e-06, "loss": 0.1011, "num_input_tokens_seen": 5620320, "step": 29485 }, { "epoch": 15.327442827442827, "grad_norm": 1.4266420602798462, "learning_rate": 7.864314441624004e-06, "loss": 0.1217, "num_input_tokens_seen": 5621280, "step": 29490 }, { "epoch": 15.33004158004158, "grad_norm": 0.22385448217391968, "learning_rate": 7.856059670561397e-06, "loss": 0.1305, "num_input_tokens_seen": 5622272, "step": 29495 }, { "epoch": 15.332640332640333, "grad_norm": 0.8977068662643433, "learning_rate": 7.847808426415854e-06, "loss": 0.0727, "num_input_tokens_seen": 5623200, "step": 29500 }, { "epoch": 15.335239085239085, "grad_norm": 0.10357112437486649, "learning_rate": 7.839560710884852e-06, "loss": 0.1568, "num_input_tokens_seen": 5624160, "step": 29505 }, { "epoch": 15.337837837837839, "grad_norm": 0.7743487358093262, "learning_rate": 7.831316525665147e-06, "loss": 0.1609, "num_input_tokens_seen": 5625120, "step": 29510 }, { "epoch": 15.34043659043659, "grad_norm": 1.9928525686264038, "learning_rate": 7.823075872452761e-06, "loss": 0.1488, "num_input_tokens_seen": 5626048, "step": 29515 }, { "epoch": 15.343035343035343, "grad_norm": 1.3571760654449463, "learning_rate": 7.814838752942985e-06, "loss": 0.1206, "num_input_tokens_seen": 5626976, "step": 29520 }, { "epoch": 15.345634095634095, "grad_norm": 0.812203049659729, "learning_rate": 7.806605168830412e-06, "loss": 0.1123, "num_input_tokens_seen": 5628032, "step": 29525 }, { "epoch": 15.348232848232849, "grad_norm": 0.19524641335010529, "learning_rate": 7.798375121808869e-06, "loss": 0.0672, "num_input_tokens_seen": 5628896, "step": 29530 }, { "epoch": 15.3508316008316, "grad_norm": 0.6432520747184753, "learning_rate": 7.790148613571468e-06, "loss": 0.1511, "num_input_tokens_seen": 5629760, "step": 29535 }, { "epoch": 15.353430353430353, "grad_norm": 0.10246820002794266, "learning_rate": 7.78192564581062e-06, "loss": 0.0297, "num_input_tokens_seen": 5630688, "step": 29540 }, { "epoch": 15.356029106029107, "grad_norm": 0.6106821894645691, "learning_rate": 7.77370622021797e-06, "loss": 0.1319, "num_input_tokens_seen": 5631616, "step": 29545 }, { "epoch": 15.358627858627859, "grad_norm": 0.9465338587760925, "learning_rate": 7.765490338484455e-06, "loss": 0.2359, "num_input_tokens_seen": 5632608, "step": 29550 }, { "epoch": 15.361226611226611, "grad_norm": 0.5717630386352539, "learning_rate": 7.757278002300265e-06, "loss": 0.0587, "num_input_tokens_seen": 5633600, "step": 29555 }, { "epoch": 15.363825363825363, "grad_norm": 0.7378389239311218, "learning_rate": 7.749069213354892e-06, "loss": 0.144, "num_input_tokens_seen": 5634592, "step": 29560 }, { "epoch": 15.366424116424117, "grad_norm": 0.12714023888111115, "learning_rate": 7.74086397333707e-06, "loss": 0.0332, "num_input_tokens_seen": 5635488, "step": 29565 }, { "epoch": 15.369022869022869, "grad_norm": 0.3699682056903839, "learning_rate": 7.732662283934816e-06, "loss": 0.2088, "num_input_tokens_seen": 5636448, "step": 29570 }, { "epoch": 15.371621621621621, "grad_norm": 0.30529966950416565, "learning_rate": 7.724464146835408e-06, "loss": 0.0472, "num_input_tokens_seen": 5637408, "step": 29575 }, { "epoch": 15.374220374220375, "grad_norm": 0.19529606401920319, "learning_rate": 7.716269563725402e-06, "loss": 0.161, "num_input_tokens_seen": 5638368, "step": 29580 }, { "epoch": 15.376819126819127, "grad_norm": 0.6400333642959595, "learning_rate": 7.708078536290616e-06, "loss": 0.1204, "num_input_tokens_seen": 5639328, "step": 29585 }, { "epoch": 15.379417879417879, "grad_norm": 0.5877010822296143, "learning_rate": 7.699891066216132e-06, "loss": 0.0498, "num_input_tokens_seen": 5640288, "step": 29590 }, { "epoch": 15.382016632016631, "grad_norm": 0.5648106336593628, "learning_rate": 7.691707155186328e-06, "loss": 0.0919, "num_input_tokens_seen": 5641184, "step": 29595 }, { "epoch": 15.384615384615385, "grad_norm": 1.4028371572494507, "learning_rate": 7.68352680488482e-06, "loss": 0.1607, "num_input_tokens_seen": 5642144, "step": 29600 }, { "epoch": 15.387214137214137, "grad_norm": 0.4774419665336609, "learning_rate": 7.675350016994496e-06, "loss": 0.1674, "num_input_tokens_seen": 5643040, "step": 29605 }, { "epoch": 15.38981288981289, "grad_norm": 0.6745354533195496, "learning_rate": 7.667176793197511e-06, "loss": 0.0895, "num_input_tokens_seen": 5644064, "step": 29610 }, { "epoch": 15.392411642411643, "grad_norm": 0.5502936244010925, "learning_rate": 7.659007135175308e-06, "loss": 0.1753, "num_input_tokens_seen": 5644928, "step": 29615 }, { "epoch": 15.395010395010395, "grad_norm": 0.6638399958610535, "learning_rate": 7.650841044608572e-06, "loss": 0.1408, "num_input_tokens_seen": 5645856, "step": 29620 }, { "epoch": 15.397609147609147, "grad_norm": 0.2911084294319153, "learning_rate": 7.642678523177266e-06, "loss": 0.0719, "num_input_tokens_seen": 5646848, "step": 29625 }, { "epoch": 15.4002079002079, "grad_norm": 0.4711270034313202, "learning_rate": 7.634519572560603e-06, "loss": 0.0446, "num_input_tokens_seen": 5647776, "step": 29630 }, { "epoch": 15.402806652806653, "grad_norm": 0.6590832471847534, "learning_rate": 7.626364194437099e-06, "loss": 0.1938, "num_input_tokens_seen": 5648736, "step": 29635 }, { "epoch": 15.405405405405405, "grad_norm": 0.3881782591342926, "learning_rate": 7.618212390484486e-06, "loss": 0.1085, "num_input_tokens_seen": 5649664, "step": 29640 }, { "epoch": 15.408004158004157, "grad_norm": 0.36250925064086914, "learning_rate": 7.610064162379782e-06, "loss": 0.1308, "num_input_tokens_seen": 5650560, "step": 29645 }, { "epoch": 15.410602910602911, "grad_norm": 0.3458541929721832, "learning_rate": 7.601919511799291e-06, "loss": 0.06, "num_input_tokens_seen": 5651456, "step": 29650 }, { "epoch": 15.413201663201663, "grad_norm": 0.2560766339302063, "learning_rate": 7.59377844041855e-06, "loss": 0.126, "num_input_tokens_seen": 5652416, "step": 29655 }, { "epoch": 15.415800415800415, "grad_norm": 1.2081804275512695, "learning_rate": 7.585640949912368e-06, "loss": 0.2969, "num_input_tokens_seen": 5653344, "step": 29660 }, { "epoch": 15.41839916839917, "grad_norm": 0.21377773582935333, "learning_rate": 7.577507041954832e-06, "loss": 0.0637, "num_input_tokens_seen": 5654304, "step": 29665 }, { "epoch": 15.420997920997921, "grad_norm": 0.2152910977602005, "learning_rate": 7.569376718219276e-06, "loss": 0.1424, "num_input_tokens_seen": 5655264, "step": 29670 }, { "epoch": 15.423596673596673, "grad_norm": 0.7779706120491028, "learning_rate": 7.561249980378301e-06, "loss": 0.2132, "num_input_tokens_seen": 5656288, "step": 29675 }, { "epoch": 15.426195426195425, "grad_norm": 0.3381095230579376, "learning_rate": 7.553126830103763e-06, "loss": 0.1231, "num_input_tokens_seen": 5657280, "step": 29680 }, { "epoch": 15.42879417879418, "grad_norm": 1.601820945739746, "learning_rate": 7.5450072690668e-06, "loss": 0.0493, "num_input_tokens_seen": 5658208, "step": 29685 }, { "epoch": 15.431392931392931, "grad_norm": 0.3545372188091278, "learning_rate": 7.536891298937795e-06, "loss": 0.123, "num_input_tokens_seen": 5659104, "step": 29690 }, { "epoch": 15.433991683991684, "grad_norm": 1.4281342029571533, "learning_rate": 7.528778921386395e-06, "loss": 0.2361, "num_input_tokens_seen": 5660128, "step": 29695 }, { "epoch": 15.436590436590437, "grad_norm": 0.8163081407546997, "learning_rate": 7.520670138081509e-06, "loss": 0.1655, "num_input_tokens_seen": 5661120, "step": 29700 }, { "epoch": 15.43918918918919, "grad_norm": 0.21764719486236572, "learning_rate": 7.5125649506913085e-06, "loss": 0.0842, "num_input_tokens_seen": 5662080, "step": 29705 }, { "epoch": 15.441787941787942, "grad_norm": 0.37116408348083496, "learning_rate": 7.504463360883221e-06, "loss": 0.113, "num_input_tokens_seen": 5663104, "step": 29710 }, { "epoch": 15.444386694386694, "grad_norm": 0.12928473949432373, "learning_rate": 7.496365370323932e-06, "loss": 0.1271, "num_input_tokens_seen": 5664128, "step": 29715 }, { "epoch": 15.446985446985448, "grad_norm": 0.5873493552207947, "learning_rate": 7.488270980679404e-06, "loss": 0.1644, "num_input_tokens_seen": 5665152, "step": 29720 }, { "epoch": 15.4495841995842, "grad_norm": 0.33695825934410095, "learning_rate": 7.480180193614841e-06, "loss": 0.0931, "num_input_tokens_seen": 5666112, "step": 29725 }, { "epoch": 15.452182952182952, "grad_norm": 1.3202259540557861, "learning_rate": 7.4720930107947065e-06, "loss": 0.1309, "num_input_tokens_seen": 5667008, "step": 29730 }, { "epoch": 15.454781704781706, "grad_norm": 0.8790745735168457, "learning_rate": 7.464009433882723e-06, "loss": 0.1796, "num_input_tokens_seen": 5668032, "step": 29735 }, { "epoch": 15.457380457380458, "grad_norm": 0.3826139569282532, "learning_rate": 7.455929464541889e-06, "loss": 0.1389, "num_input_tokens_seen": 5668992, "step": 29740 }, { "epoch": 15.45997920997921, "grad_norm": 0.12556761503219604, "learning_rate": 7.447853104434438e-06, "loss": 0.0639, "num_input_tokens_seen": 5669920, "step": 29745 }, { "epoch": 15.462577962577962, "grad_norm": 0.10335466265678406, "learning_rate": 7.439780355221873e-06, "loss": 0.1442, "num_input_tokens_seen": 5670976, "step": 29750 }, { "epoch": 15.465176715176716, "grad_norm": 0.42207396030426025, "learning_rate": 7.431711218564938e-06, "loss": 0.1128, "num_input_tokens_seen": 5671904, "step": 29755 }, { "epoch": 15.467775467775468, "grad_norm": 0.1296418011188507, "learning_rate": 7.423645696123665e-06, "loss": 0.0386, "num_input_tokens_seen": 5672928, "step": 29760 }, { "epoch": 15.47037422037422, "grad_norm": 0.8672767281532288, "learning_rate": 7.415583789557326e-06, "loss": 0.1774, "num_input_tokens_seen": 5673856, "step": 29765 }, { "epoch": 15.472972972972974, "grad_norm": 0.5673983693122864, "learning_rate": 7.4075255005244205e-06, "loss": 0.1382, "num_input_tokens_seen": 5674784, "step": 29770 }, { "epoch": 15.475571725571726, "grad_norm": 1.0020374059677124, "learning_rate": 7.399470830682756e-06, "loss": 0.1027, "num_input_tokens_seen": 5675776, "step": 29775 }, { "epoch": 15.478170478170478, "grad_norm": 0.5491193532943726, "learning_rate": 7.39141978168936e-06, "loss": 0.1387, "num_input_tokens_seen": 5676736, "step": 29780 }, { "epoch": 15.48076923076923, "grad_norm": 0.23937880992889404, "learning_rate": 7.383372355200522e-06, "loss": 0.0628, "num_input_tokens_seen": 5677728, "step": 29785 }, { "epoch": 15.483367983367984, "grad_norm": 0.20461882650852203, "learning_rate": 7.375328552871799e-06, "loss": 0.14, "num_input_tokens_seen": 5678656, "step": 29790 }, { "epoch": 15.485966735966736, "grad_norm": 0.8240512609481812, "learning_rate": 7.367288376357989e-06, "loss": 0.1226, "num_input_tokens_seen": 5679584, "step": 29795 }, { "epoch": 15.488565488565488, "grad_norm": 0.7378859519958496, "learning_rate": 7.359251827313149e-06, "loss": 0.1752, "num_input_tokens_seen": 5680544, "step": 29800 }, { "epoch": 15.491164241164242, "grad_norm": 1.505987524986267, "learning_rate": 7.351218907390578e-06, "loss": 0.1091, "num_input_tokens_seen": 5681504, "step": 29805 }, { "epoch": 15.493762993762994, "grad_norm": 0.4747849106788635, "learning_rate": 7.343189618242854e-06, "loss": 0.0319, "num_input_tokens_seen": 5682496, "step": 29810 }, { "epoch": 15.496361746361746, "grad_norm": 0.36140355467796326, "learning_rate": 7.335163961521787e-06, "loss": 0.1692, "num_input_tokens_seen": 5683424, "step": 29815 }, { "epoch": 15.4989604989605, "grad_norm": 0.6475513577461243, "learning_rate": 7.327141938878446e-06, "loss": 0.072, "num_input_tokens_seen": 5684288, "step": 29820 }, { "epoch": 15.501559251559252, "grad_norm": 0.5451962947845459, "learning_rate": 7.3191235519631446e-06, "loss": 0.1027, "num_input_tokens_seen": 5685280, "step": 29825 }, { "epoch": 15.504158004158004, "grad_norm": 1.6280796527862549, "learning_rate": 7.311108802425479e-06, "loss": 0.1967, "num_input_tokens_seen": 5686208, "step": 29830 }, { "epoch": 15.506756756756756, "grad_norm": 0.2440042495727539, "learning_rate": 7.3030976919142516e-06, "loss": 0.1227, "num_input_tokens_seen": 5687232, "step": 29835 }, { "epoch": 15.50935550935551, "grad_norm": 0.03648039698600769, "learning_rate": 7.295090222077536e-06, "loss": 0.0607, "num_input_tokens_seen": 5688096, "step": 29840 }, { "epoch": 15.511954261954262, "grad_norm": 0.9020504951477051, "learning_rate": 7.2870863945626785e-06, "loss": 0.1305, "num_input_tokens_seen": 5689056, "step": 29845 }, { "epoch": 15.514553014553014, "grad_norm": 0.26483291387557983, "learning_rate": 7.279086211016245e-06, "loss": 0.19, "num_input_tokens_seen": 5689984, "step": 29850 }, { "epoch": 15.517151767151766, "grad_norm": 0.8428797721862793, "learning_rate": 7.2710896730840696e-06, "loss": 0.0792, "num_input_tokens_seen": 5690944, "step": 29855 }, { "epoch": 15.51975051975052, "grad_norm": 0.5877804160118103, "learning_rate": 7.263096782411219e-06, "loss": 0.1457, "num_input_tokens_seen": 5691904, "step": 29860 }, { "epoch": 15.522349272349272, "grad_norm": 0.7489648461341858, "learning_rate": 7.255107540642037e-06, "loss": 0.2704, "num_input_tokens_seen": 5692864, "step": 29865 }, { "epoch": 15.524948024948024, "grad_norm": 0.29905539751052856, "learning_rate": 7.2471219494200965e-06, "loss": 0.0469, "num_input_tokens_seen": 5693728, "step": 29870 }, { "epoch": 15.527546777546778, "grad_norm": 0.6592094302177429, "learning_rate": 7.2391400103882206e-06, "loss": 0.0824, "num_input_tokens_seen": 5694656, "step": 29875 }, { "epoch": 15.53014553014553, "grad_norm": 0.6942042112350464, "learning_rate": 7.231161725188476e-06, "loss": 0.1175, "num_input_tokens_seen": 5695648, "step": 29880 }, { "epoch": 15.532744282744282, "grad_norm": 0.17812179028987885, "learning_rate": 7.223187095462206e-06, "loss": 0.147, "num_input_tokens_seen": 5696640, "step": 29885 }, { "epoch": 15.535343035343036, "grad_norm": 0.5466296076774597, "learning_rate": 7.2152161228499686e-06, "loss": 0.0846, "num_input_tokens_seen": 5697632, "step": 29890 }, { "epoch": 15.537941787941788, "grad_norm": 0.6585956811904907, "learning_rate": 7.207248808991587e-06, "loss": 0.0686, "num_input_tokens_seen": 5698560, "step": 29895 }, { "epoch": 15.54054054054054, "grad_norm": 0.2634575068950653, "learning_rate": 7.1992851555261255e-06, "loss": 0.1285, "num_input_tokens_seen": 5699552, "step": 29900 }, { "epoch": 15.543139293139292, "grad_norm": 1.019506812095642, "learning_rate": 7.1913251640918975e-06, "loss": 0.1618, "num_input_tokens_seen": 5700416, "step": 29905 }, { "epoch": 15.545738045738046, "grad_norm": 1.257957935333252, "learning_rate": 7.183368836326456e-06, "loss": 0.1939, "num_input_tokens_seen": 5701344, "step": 29910 }, { "epoch": 15.548336798336798, "grad_norm": 0.28105372190475464, "learning_rate": 7.175416173866622e-06, "loss": 0.1593, "num_input_tokens_seen": 5702240, "step": 29915 }, { "epoch": 15.55093555093555, "grad_norm": 1.7319917678833008, "learning_rate": 7.1674671783484445e-06, "loss": 0.1768, "num_input_tokens_seen": 5703136, "step": 29920 }, { "epoch": 15.553534303534304, "grad_norm": 0.1655505895614624, "learning_rate": 7.159521851407211e-06, "loss": 0.1594, "num_input_tokens_seen": 5704096, "step": 29925 }, { "epoch": 15.556133056133056, "grad_norm": 0.04070983827114105, "learning_rate": 7.151580194677468e-06, "loss": 0.2806, "num_input_tokens_seen": 5704992, "step": 29930 }, { "epoch": 15.558731808731808, "grad_norm": 0.7147299647331238, "learning_rate": 7.1436422097930115e-06, "loss": 0.1405, "num_input_tokens_seen": 5705952, "step": 29935 }, { "epoch": 15.56133056133056, "grad_norm": 0.8916382789611816, "learning_rate": 7.13570789838687e-06, "loss": 0.1353, "num_input_tokens_seen": 5706944, "step": 29940 }, { "epoch": 15.563929313929314, "grad_norm": 0.3618457019329071, "learning_rate": 7.127777262091317e-06, "loss": 0.0631, "num_input_tokens_seen": 5707872, "step": 29945 }, { "epoch": 15.566528066528067, "grad_norm": 0.4174746572971344, "learning_rate": 7.11985030253787e-06, "loss": 0.0305, "num_input_tokens_seen": 5708864, "step": 29950 }, { "epoch": 15.569126819126819, "grad_norm": 0.5967804193496704, "learning_rate": 7.1119270213573066e-06, "loss": 0.0817, "num_input_tokens_seen": 5709824, "step": 29955 }, { "epoch": 15.571725571725572, "grad_norm": 0.7720668315887451, "learning_rate": 7.104007420179629e-06, "loss": 0.1367, "num_input_tokens_seen": 5710752, "step": 29960 }, { "epoch": 15.574324324324325, "grad_norm": 0.2814751863479614, "learning_rate": 7.0960915006340836e-06, "loss": 0.128, "num_input_tokens_seen": 5711744, "step": 29965 }, { "epoch": 15.576923076923077, "grad_norm": 0.8853018283843994, "learning_rate": 7.088179264349165e-06, "loss": 0.16, "num_input_tokens_seen": 5712704, "step": 29970 }, { "epoch": 15.579521829521829, "grad_norm": 0.7859911918640137, "learning_rate": 7.080270712952611e-06, "loss": 0.112, "num_input_tokens_seen": 5713664, "step": 29975 }, { "epoch": 15.582120582120583, "grad_norm": 0.11939108371734619, "learning_rate": 7.0723658480714e-06, "loss": 0.1221, "num_input_tokens_seen": 5714656, "step": 29980 }, { "epoch": 15.584719334719335, "grad_norm": 0.20815055072307587, "learning_rate": 7.064464671331739e-06, "loss": 0.3241, "num_input_tokens_seen": 5715712, "step": 29985 }, { "epoch": 15.587318087318087, "grad_norm": 0.85836261510849, "learning_rate": 7.056567184359109e-06, "loss": 0.1998, "num_input_tokens_seen": 5716672, "step": 29990 }, { "epoch": 15.58991683991684, "grad_norm": 0.3306724429130554, "learning_rate": 7.048673388778196e-06, "loss": 0.0325, "num_input_tokens_seen": 5717696, "step": 29995 }, { "epoch": 15.592515592515593, "grad_norm": 1.0427495241165161, "learning_rate": 7.0407832862129485e-06, "loss": 0.103, "num_input_tokens_seen": 5718624, "step": 30000 }, { "epoch": 15.595114345114345, "grad_norm": 0.6850582957267761, "learning_rate": 7.032896878286541e-06, "loss": 0.0739, "num_input_tokens_seen": 5719616, "step": 30005 }, { "epoch": 15.597713097713097, "grad_norm": 1.5867398977279663, "learning_rate": 7.0250141666214075e-06, "loss": 0.1249, "num_input_tokens_seen": 5720480, "step": 30010 }, { "epoch": 15.60031185031185, "grad_norm": 1.2544653415679932, "learning_rate": 7.017135152839202e-06, "loss": 0.124, "num_input_tokens_seen": 5721440, "step": 30015 }, { "epoch": 15.602910602910603, "grad_norm": 1.5735385417938232, "learning_rate": 7.009259838560822e-06, "loss": 0.269, "num_input_tokens_seen": 5722368, "step": 30020 }, { "epoch": 15.605509355509355, "grad_norm": 0.5325809121131897, "learning_rate": 7.001388225406416e-06, "loss": 0.1456, "num_input_tokens_seen": 5723360, "step": 30025 }, { "epoch": 15.608108108108109, "grad_norm": 1.5056074857711792, "learning_rate": 6.993520314995369e-06, "loss": 0.1093, "num_input_tokens_seen": 5724320, "step": 30030 }, { "epoch": 15.61070686070686, "grad_norm": 0.15296310186386108, "learning_rate": 6.98565610894627e-06, "loss": 0.0421, "num_input_tokens_seen": 5725248, "step": 30035 }, { "epoch": 15.613305613305613, "grad_norm": 0.33048707246780396, "learning_rate": 6.977795608877e-06, "loss": 0.2111, "num_input_tokens_seen": 5726176, "step": 30040 }, { "epoch": 15.615904365904367, "grad_norm": 0.3142857849597931, "learning_rate": 6.969938816404639e-06, "loss": 0.0463, "num_input_tokens_seen": 5727136, "step": 30045 }, { "epoch": 15.618503118503119, "grad_norm": 0.23027034103870392, "learning_rate": 6.962085733145523e-06, "loss": 0.1554, "num_input_tokens_seen": 5728064, "step": 30050 }, { "epoch": 15.621101871101871, "grad_norm": 0.14815251529216766, "learning_rate": 6.954236360715205e-06, "loss": 0.1286, "num_input_tokens_seen": 5728992, "step": 30055 }, { "epoch": 15.623700623700623, "grad_norm": 0.10746323317289352, "learning_rate": 6.946390700728506e-06, "loss": 0.1412, "num_input_tokens_seen": 5729952, "step": 30060 }, { "epoch": 15.626299376299377, "grad_norm": 0.19090865552425385, "learning_rate": 6.938548754799454e-06, "loss": 0.0944, "num_input_tokens_seen": 5730880, "step": 30065 }, { "epoch": 15.628898128898129, "grad_norm": 1.3979997634887695, "learning_rate": 6.930710524541331e-06, "loss": 0.0927, "num_input_tokens_seen": 5731840, "step": 30070 }, { "epoch": 15.631496881496881, "grad_norm": 0.28365013003349304, "learning_rate": 6.922876011566634e-06, "loss": 0.1001, "num_input_tokens_seen": 5732768, "step": 30075 }, { "epoch": 15.634095634095633, "grad_norm": 1.2806525230407715, "learning_rate": 6.915045217487126e-06, "loss": 0.2751, "num_input_tokens_seen": 5733760, "step": 30080 }, { "epoch": 15.636694386694387, "grad_norm": 1.1273881196975708, "learning_rate": 6.9072181439137825e-06, "loss": 0.1128, "num_input_tokens_seen": 5734720, "step": 30085 }, { "epoch": 15.63929313929314, "grad_norm": 0.9296101927757263, "learning_rate": 6.899394792456815e-06, "loss": 0.0884, "num_input_tokens_seen": 5735616, "step": 30090 }, { "epoch": 15.641891891891891, "grad_norm": 0.580790102481842, "learning_rate": 6.891575164725678e-06, "loss": 0.1458, "num_input_tokens_seen": 5736512, "step": 30095 }, { "epoch": 15.644490644490645, "grad_norm": 0.7782406210899353, "learning_rate": 6.883759262329051e-06, "loss": 0.1938, "num_input_tokens_seen": 5737504, "step": 30100 }, { "epoch": 15.647089397089397, "grad_norm": 1.5502291917800903, "learning_rate": 6.875947086874854e-06, "loss": 0.1265, "num_input_tokens_seen": 5738496, "step": 30105 }, { "epoch": 15.64968814968815, "grad_norm": 0.3091242015361786, "learning_rate": 6.868138639970228e-06, "loss": 0.0579, "num_input_tokens_seen": 5739328, "step": 30110 }, { "epoch": 15.652286902286903, "grad_norm": 0.3450397849082947, "learning_rate": 6.860333923221573e-06, "loss": 0.1262, "num_input_tokens_seen": 5740384, "step": 30115 }, { "epoch": 15.654885654885655, "grad_norm": 0.5511334538459778, "learning_rate": 6.852532938234498e-06, "loss": 0.1582, "num_input_tokens_seen": 5741376, "step": 30120 }, { "epoch": 15.657484407484407, "grad_norm": 0.38567259907722473, "learning_rate": 6.844735686613854e-06, "loss": 0.1285, "num_input_tokens_seen": 5742304, "step": 30125 }, { "epoch": 15.66008316008316, "grad_norm": 0.13746462762355804, "learning_rate": 6.836942169963709e-06, "loss": 0.2059, "num_input_tokens_seen": 5743232, "step": 30130 }, { "epoch": 15.662681912681913, "grad_norm": 0.9678397178649902, "learning_rate": 6.829152389887391e-06, "loss": 0.1156, "num_input_tokens_seen": 5744256, "step": 30135 }, { "epoch": 15.665280665280665, "grad_norm": 0.11040589958429337, "learning_rate": 6.82136634798744e-06, "loss": 0.1237, "num_input_tokens_seen": 5745248, "step": 30140 }, { "epoch": 15.667879417879417, "grad_norm": 1.4567482471466064, "learning_rate": 6.813584045865618e-06, "loss": 0.212, "num_input_tokens_seen": 5746240, "step": 30145 }, { "epoch": 15.670478170478171, "grad_norm": 0.8524379730224609, "learning_rate": 6.805805485122949e-06, "loss": 0.2525, "num_input_tokens_seen": 5747200, "step": 30150 }, { "epoch": 15.673076923076923, "grad_norm": 0.42785367369651794, "learning_rate": 6.7980306673596565e-06, "loss": 0.0516, "num_input_tokens_seen": 5748192, "step": 30155 }, { "epoch": 15.675675675675675, "grad_norm": 0.32869186997413635, "learning_rate": 6.790259594175208e-06, "loss": 0.1043, "num_input_tokens_seen": 5749184, "step": 30160 }, { "epoch": 15.678274428274428, "grad_norm": 1.1606954336166382, "learning_rate": 6.7824922671683e-06, "loss": 0.1755, "num_input_tokens_seen": 5750112, "step": 30165 }, { "epoch": 15.680873180873181, "grad_norm": 0.1988266259431839, "learning_rate": 6.774728687936854e-06, "loss": 0.1059, "num_input_tokens_seen": 5751072, "step": 30170 }, { "epoch": 15.683471933471933, "grad_norm": 1.039374589920044, "learning_rate": 6.766968858078026e-06, "loss": 0.112, "num_input_tokens_seen": 5751968, "step": 30175 }, { "epoch": 15.686070686070686, "grad_norm": 2.0218842029571533, "learning_rate": 6.759212779188187e-06, "loss": 0.0998, "num_input_tokens_seen": 5752928, "step": 30180 }, { "epoch": 15.68866943866944, "grad_norm": 1.1361620426177979, "learning_rate": 6.751460452862968e-06, "loss": 0.2286, "num_input_tokens_seen": 5753856, "step": 30185 }, { "epoch": 15.691268191268192, "grad_norm": 1.0883570909500122, "learning_rate": 6.743711880697193e-06, "loss": 0.1335, "num_input_tokens_seen": 5754848, "step": 30190 }, { "epoch": 15.693866943866944, "grad_norm": 0.6977081298828125, "learning_rate": 6.735967064284931e-06, "loss": 0.0975, "num_input_tokens_seen": 5755776, "step": 30195 }, { "epoch": 15.696465696465696, "grad_norm": 0.9170204401016235, "learning_rate": 6.7282260052194675e-06, "loss": 0.0481, "num_input_tokens_seen": 5756768, "step": 30200 }, { "epoch": 15.69906444906445, "grad_norm": 0.6441226005554199, "learning_rate": 6.720488705093339e-06, "loss": 0.162, "num_input_tokens_seen": 5757760, "step": 30205 }, { "epoch": 15.701663201663202, "grad_norm": 1.1697282791137695, "learning_rate": 6.712755165498286e-06, "loss": 0.2537, "num_input_tokens_seen": 5758720, "step": 30210 }, { "epoch": 15.704261954261954, "grad_norm": 0.7852962613105774, "learning_rate": 6.705025388025277e-06, "loss": 0.1222, "num_input_tokens_seen": 5759616, "step": 30215 }, { "epoch": 15.706860706860708, "grad_norm": 1.2187565565109253, "learning_rate": 6.697299374264507e-06, "loss": 0.1442, "num_input_tokens_seen": 5760480, "step": 30220 }, { "epoch": 15.70945945945946, "grad_norm": 0.21515513956546783, "learning_rate": 6.689577125805424e-06, "loss": 0.0906, "num_input_tokens_seen": 5761408, "step": 30225 }, { "epoch": 15.712058212058212, "grad_norm": 0.845546305179596, "learning_rate": 6.681858644236655e-06, "loss": 0.0925, "num_input_tokens_seen": 5762304, "step": 30230 }, { "epoch": 15.714656964656964, "grad_norm": 0.12565511465072632, "learning_rate": 6.674143931146079e-06, "loss": 0.0503, "num_input_tokens_seen": 5763232, "step": 30235 }, { "epoch": 15.717255717255718, "grad_norm": 0.9514118432998657, "learning_rate": 6.666432988120805e-06, "loss": 0.1102, "num_input_tokens_seen": 5764256, "step": 30240 }, { "epoch": 15.71985446985447, "grad_norm": 1.2730772495269775, "learning_rate": 6.658725816747155e-06, "loss": 0.1629, "num_input_tokens_seen": 5765248, "step": 30245 }, { "epoch": 15.722453222453222, "grad_norm": 0.14538443088531494, "learning_rate": 6.651022418610678e-06, "loss": 0.1694, "num_input_tokens_seen": 5766240, "step": 30250 }, { "epoch": 15.725051975051976, "grad_norm": 0.2502521276473999, "learning_rate": 6.6433227952961365e-06, "loss": 0.0783, "num_input_tokens_seen": 5767232, "step": 30255 }, { "epoch": 15.727650727650728, "grad_norm": 0.11112634092569351, "learning_rate": 6.635626948387541e-06, "loss": 0.0629, "num_input_tokens_seen": 5768256, "step": 30260 }, { "epoch": 15.73024948024948, "grad_norm": 0.5504472255706787, "learning_rate": 6.6279348794681065e-06, "loss": 0.2494, "num_input_tokens_seen": 5769216, "step": 30265 }, { "epoch": 15.732848232848234, "grad_norm": 1.6826136112213135, "learning_rate": 6.620246590120263e-06, "loss": 0.2044, "num_input_tokens_seen": 5770208, "step": 30270 }, { "epoch": 15.735446985446986, "grad_norm": 0.684039294719696, "learning_rate": 6.612562081925697e-06, "loss": 0.1483, "num_input_tokens_seen": 5771104, "step": 30275 }, { "epoch": 15.738045738045738, "grad_norm": 0.1257839947938919, "learning_rate": 6.604881356465278e-06, "loss": 0.1157, "num_input_tokens_seen": 5772096, "step": 30280 }, { "epoch": 15.74064449064449, "grad_norm": 1.8151456117630005, "learning_rate": 6.597204415319122e-06, "loss": 0.2735, "num_input_tokens_seen": 5772960, "step": 30285 }, { "epoch": 15.743243243243244, "grad_norm": 1.3567653894424438, "learning_rate": 6.589531260066556e-06, "loss": 0.2128, "num_input_tokens_seen": 5773888, "step": 30290 }, { "epoch": 15.745841995841996, "grad_norm": 0.8483375310897827, "learning_rate": 6.5818618922861284e-06, "loss": 0.1749, "num_input_tokens_seen": 5774912, "step": 30295 }, { "epoch": 15.748440748440748, "grad_norm": 0.669687807559967, "learning_rate": 6.574196313555614e-06, "loss": 0.1539, "num_input_tokens_seen": 5775840, "step": 30300 }, { "epoch": 15.7510395010395, "grad_norm": 0.7414963245391846, "learning_rate": 6.5665345254519975e-06, "loss": 0.0666, "num_input_tokens_seen": 5776832, "step": 30305 }, { "epoch": 15.753638253638254, "grad_norm": 0.49978959560394287, "learning_rate": 6.5588765295515056e-06, "loss": 0.0686, "num_input_tokens_seen": 5777888, "step": 30310 }, { "epoch": 15.756237006237006, "grad_norm": 0.5767985582351685, "learning_rate": 6.551222327429565e-06, "loss": 0.055, "num_input_tokens_seen": 5778880, "step": 30315 }, { "epoch": 15.758835758835758, "grad_norm": 1.8242651224136353, "learning_rate": 6.543571920660821e-06, "loss": 0.1367, "num_input_tokens_seen": 5779808, "step": 30320 }, { "epoch": 15.761434511434512, "grad_norm": 0.4329991042613983, "learning_rate": 6.535925310819146e-06, "loss": 0.1426, "num_input_tokens_seen": 5780704, "step": 30325 }, { "epoch": 15.764033264033264, "grad_norm": 0.5946438908576965, "learning_rate": 6.528282499477637e-06, "loss": 0.1059, "num_input_tokens_seen": 5781568, "step": 30330 }, { "epoch": 15.766632016632016, "grad_norm": 0.5080305337905884, "learning_rate": 6.5206434882086e-06, "loss": 0.0492, "num_input_tokens_seen": 5782464, "step": 30335 }, { "epoch": 15.76923076923077, "grad_norm": 1.557267427444458, "learning_rate": 6.51300827858356e-06, "loss": 0.1616, "num_input_tokens_seen": 5783488, "step": 30340 }, { "epoch": 15.771829521829522, "grad_norm": 0.7994467616081238, "learning_rate": 6.5053768721732525e-06, "loss": 0.1187, "num_input_tokens_seen": 5784512, "step": 30345 }, { "epoch": 15.774428274428274, "grad_norm": 0.5961526036262512, "learning_rate": 6.497749270547659e-06, "loss": 0.1406, "num_input_tokens_seen": 5785440, "step": 30350 }, { "epoch": 15.777027027027026, "grad_norm": 0.6291632652282715, "learning_rate": 6.490125475275954e-06, "loss": 0.1159, "num_input_tokens_seen": 5786368, "step": 30355 }, { "epoch": 15.77962577962578, "grad_norm": 0.8375017642974854, "learning_rate": 6.482505487926513e-06, "loss": 0.1176, "num_input_tokens_seen": 5787360, "step": 30360 }, { "epoch": 15.782224532224532, "grad_norm": 0.8413993716239929, "learning_rate": 6.474889310066972e-06, "loss": 0.1958, "num_input_tokens_seen": 5788288, "step": 30365 }, { "epoch": 15.784823284823284, "grad_norm": 0.6143885254859924, "learning_rate": 6.4672769432641555e-06, "loss": 0.2899, "num_input_tokens_seen": 5789280, "step": 30370 }, { "epoch": 15.787422037422038, "grad_norm": 0.28797563910484314, "learning_rate": 6.459668389084103e-06, "loss": 0.1159, "num_input_tokens_seen": 5790176, "step": 30375 }, { "epoch": 15.79002079002079, "grad_norm": 0.7463968396186829, "learning_rate": 6.452063649092074e-06, "loss": 0.1265, "num_input_tokens_seen": 5791072, "step": 30380 }, { "epoch": 15.792619542619542, "grad_norm": 0.6939801573753357, "learning_rate": 6.444462724852554e-06, "loss": 0.1166, "num_input_tokens_seen": 5792096, "step": 30385 }, { "epoch": 15.795218295218294, "grad_norm": 0.09836248308420181, "learning_rate": 6.43686561792923e-06, "loss": 0.0991, "num_input_tokens_seen": 5793088, "step": 30390 }, { "epoch": 15.797817047817048, "grad_norm": 1.190243124961853, "learning_rate": 6.429272329885e-06, "loss": 0.1202, "num_input_tokens_seen": 5793984, "step": 30395 }, { "epoch": 15.8004158004158, "grad_norm": 1.1659128665924072, "learning_rate": 6.421682862282e-06, "loss": 0.2492, "num_input_tokens_seen": 5795040, "step": 30400 }, { "epoch": 15.803014553014552, "grad_norm": 0.17175762355327606, "learning_rate": 6.414097216681555e-06, "loss": 0.0582, "num_input_tokens_seen": 5795968, "step": 30405 }, { "epoch": 15.805613305613306, "grad_norm": 0.043970637023448944, "learning_rate": 6.406515394644214e-06, "loss": 0.1533, "num_input_tokens_seen": 5796928, "step": 30410 }, { "epoch": 15.808212058212058, "grad_norm": 1.1381890773773193, "learning_rate": 6.3989373977297315e-06, "loss": 0.1587, "num_input_tokens_seen": 5797824, "step": 30415 }, { "epoch": 15.81081081081081, "grad_norm": 0.9626421928405762, "learning_rate": 6.391363227497102e-06, "loss": 0.0868, "num_input_tokens_seen": 5798784, "step": 30420 }, { "epoch": 15.813409563409563, "grad_norm": 0.45514410734176636, "learning_rate": 6.383792885504495e-06, "loss": 0.0895, "num_input_tokens_seen": 5799680, "step": 30425 }, { "epoch": 15.816008316008316, "grad_norm": 0.5949755907058716, "learning_rate": 6.3762263733093075e-06, "loss": 0.0693, "num_input_tokens_seen": 5800640, "step": 30430 }, { "epoch": 15.818607068607069, "grad_norm": 1.1551424264907837, "learning_rate": 6.368663692468166e-06, "loss": 0.2721, "num_input_tokens_seen": 5801664, "step": 30435 }, { "epoch": 15.82120582120582, "grad_norm": 0.052983902394771576, "learning_rate": 6.361104844536886e-06, "loss": 0.0527, "num_input_tokens_seen": 5802560, "step": 30440 }, { "epoch": 15.823804573804575, "grad_norm": 0.12412495911121368, "learning_rate": 6.353549831070505e-06, "loss": 0.1254, "num_input_tokens_seen": 5803520, "step": 30445 }, { "epoch": 15.826403326403327, "grad_norm": 0.9763228297233582, "learning_rate": 6.345998653623259e-06, "loss": 0.1016, "num_input_tokens_seen": 5804448, "step": 30450 }, { "epoch": 15.829002079002079, "grad_norm": 0.28514254093170166, "learning_rate": 6.338451313748622e-06, "loss": 0.1623, "num_input_tokens_seen": 5805376, "step": 30455 }, { "epoch": 15.83160083160083, "grad_norm": 0.8687437772750854, "learning_rate": 6.330907812999256e-06, "loss": 0.0949, "num_input_tokens_seen": 5806272, "step": 30460 }, { "epoch": 15.834199584199585, "grad_norm": 0.6497182250022888, "learning_rate": 6.323368152927037e-06, "loss": 0.0881, "num_input_tokens_seen": 5807232, "step": 30465 }, { "epoch": 15.836798336798337, "grad_norm": 0.2435094118118286, "learning_rate": 6.315832335083044e-06, "loss": 0.0574, "num_input_tokens_seen": 5808192, "step": 30470 }, { "epoch": 15.839397089397089, "grad_norm": 0.3420788645744324, "learning_rate": 6.308300361017588e-06, "loss": 0.0596, "num_input_tokens_seen": 5809152, "step": 30475 }, { "epoch": 15.841995841995843, "grad_norm": 0.34591442346572876, "learning_rate": 6.300772232280172e-06, "loss": 0.1174, "num_input_tokens_seen": 5810080, "step": 30480 }, { "epoch": 15.844594594594595, "grad_norm": 0.8891360759735107, "learning_rate": 6.2932479504195115e-06, "loss": 0.0766, "num_input_tokens_seen": 5811008, "step": 30485 }, { "epoch": 15.847193347193347, "grad_norm": 0.4102691411972046, "learning_rate": 6.285727516983528e-06, "loss": 0.0472, "num_input_tokens_seen": 5812000, "step": 30490 }, { "epoch": 15.8497920997921, "grad_norm": 0.12636961042881012, "learning_rate": 6.278210933519355e-06, "loss": 0.0741, "num_input_tokens_seen": 5812960, "step": 30495 }, { "epoch": 15.852390852390853, "grad_norm": 0.7169199585914612, "learning_rate": 6.27069820157333e-06, "loss": 0.1182, "num_input_tokens_seen": 5813888, "step": 30500 }, { "epoch": 15.854989604989605, "grad_norm": 1.1674573421478271, "learning_rate": 6.263189322690999e-06, "loss": 0.1711, "num_input_tokens_seen": 5814848, "step": 30505 }, { "epoch": 15.857588357588357, "grad_norm": 0.26319628953933716, "learning_rate": 6.255684298417128e-06, "loss": 0.1194, "num_input_tokens_seen": 5815840, "step": 30510 }, { "epoch": 15.86018711018711, "grad_norm": 0.9937357902526855, "learning_rate": 6.248183130295673e-06, "loss": 0.1448, "num_input_tokens_seen": 5816672, "step": 30515 }, { "epoch": 15.862785862785863, "grad_norm": 0.3402453064918518, "learning_rate": 6.240685819869798e-06, "loss": 0.1713, "num_input_tokens_seen": 5817600, "step": 30520 }, { "epoch": 15.865384615384615, "grad_norm": 0.8113294243812561, "learning_rate": 6.23319236868189e-06, "loss": 0.0401, "num_input_tokens_seen": 5818560, "step": 30525 }, { "epoch": 15.867983367983367, "grad_norm": 0.56923508644104, "learning_rate": 6.2257027782735265e-06, "loss": 0.0871, "num_input_tokens_seen": 5819488, "step": 30530 }, { "epoch": 15.870582120582121, "grad_norm": 0.2691762149333954, "learning_rate": 6.218217050185487e-06, "loss": 0.0986, "num_input_tokens_seen": 5820448, "step": 30535 }, { "epoch": 15.873180873180873, "grad_norm": 0.48632100224494934, "learning_rate": 6.210735185957767e-06, "loss": 0.1342, "num_input_tokens_seen": 5821408, "step": 30540 }, { "epoch": 15.875779625779625, "grad_norm": 0.7745938897132874, "learning_rate": 6.203257187129572e-06, "loss": 0.1428, "num_input_tokens_seen": 5822464, "step": 30545 }, { "epoch": 15.878378378378379, "grad_norm": 1.252929925918579, "learning_rate": 6.195783055239307e-06, "loss": 0.1922, "num_input_tokens_seen": 5823392, "step": 30550 }, { "epoch": 15.880977130977131, "grad_norm": 0.5999222993850708, "learning_rate": 6.188312791824558e-06, "loss": 0.0965, "num_input_tokens_seen": 5824352, "step": 30555 }, { "epoch": 15.883575883575883, "grad_norm": 0.18420417606830597, "learning_rate": 6.180846398422158e-06, "loss": 0.225, "num_input_tokens_seen": 5825280, "step": 30560 }, { "epoch": 15.886174636174637, "grad_norm": 1.1697758436203003, "learning_rate": 6.173383876568112e-06, "loss": 0.1562, "num_input_tokens_seen": 5826240, "step": 30565 }, { "epoch": 15.888773388773389, "grad_norm": 0.911547839641571, "learning_rate": 6.165925227797639e-06, "loss": 0.133, "num_input_tokens_seen": 5827200, "step": 30570 }, { "epoch": 15.891372141372141, "grad_norm": 0.35553082823753357, "learning_rate": 6.158470453645157e-06, "loss": 0.1289, "num_input_tokens_seen": 5828192, "step": 30575 }, { "epoch": 15.893970893970893, "grad_norm": 0.7986827492713928, "learning_rate": 6.1510195556443e-06, "loss": 0.1158, "num_input_tokens_seen": 5829120, "step": 30580 }, { "epoch": 15.896569646569647, "grad_norm": 1.2089754343032837, "learning_rate": 6.143572535327893e-06, "loss": 0.0594, "num_input_tokens_seen": 5830048, "step": 30585 }, { "epoch": 15.8991683991684, "grad_norm": 0.854678213596344, "learning_rate": 6.1361293942279655e-06, "loss": 0.1638, "num_input_tokens_seen": 5831040, "step": 30590 }, { "epoch": 15.901767151767151, "grad_norm": 1.4734385013580322, "learning_rate": 6.128690133875736e-06, "loss": 0.1809, "num_input_tokens_seen": 5832032, "step": 30595 }, { "epoch": 15.904365904365905, "grad_norm": 1.5475460290908813, "learning_rate": 6.121254755801659e-06, "loss": 0.0969, "num_input_tokens_seen": 5832960, "step": 30600 }, { "epoch": 15.906964656964657, "grad_norm": 0.29838690161705017, "learning_rate": 6.113823261535356e-06, "loss": 0.1432, "num_input_tokens_seen": 5833856, "step": 30605 }, { "epoch": 15.90956340956341, "grad_norm": 1.038759469985962, "learning_rate": 6.106395652605668e-06, "loss": 0.2329, "num_input_tokens_seen": 5834880, "step": 30610 }, { "epoch": 15.912162162162161, "grad_norm": 1.3460814952850342, "learning_rate": 6.098971930540628e-06, "loss": 0.214, "num_input_tokens_seen": 5835840, "step": 30615 }, { "epoch": 15.914760914760915, "grad_norm": 0.6067748665809631, "learning_rate": 6.091552096867473e-06, "loss": 0.0617, "num_input_tokens_seen": 5836768, "step": 30620 }, { "epoch": 15.917359667359667, "grad_norm": 0.4217059910297394, "learning_rate": 6.084136153112638e-06, "loss": 0.1798, "num_input_tokens_seen": 5837792, "step": 30625 }, { "epoch": 15.91995841995842, "grad_norm": 0.7795775532722473, "learning_rate": 6.0767241008017535e-06, "loss": 0.0899, "num_input_tokens_seen": 5838752, "step": 30630 }, { "epoch": 15.922557172557173, "grad_norm": 0.0892740935087204, "learning_rate": 6.069315941459669e-06, "loss": 0.0398, "num_input_tokens_seen": 5839648, "step": 30635 }, { "epoch": 15.925155925155925, "grad_norm": 0.5179701447486877, "learning_rate": 6.061911676610416e-06, "loss": 0.0747, "num_input_tokens_seen": 5840576, "step": 30640 }, { "epoch": 15.927754677754677, "grad_norm": 0.21818102896213531, "learning_rate": 6.0545113077772136e-06, "loss": 0.0973, "num_input_tokens_seen": 5841536, "step": 30645 }, { "epoch": 15.93035343035343, "grad_norm": 1.311789631843567, "learning_rate": 6.047114836482515e-06, "loss": 0.0752, "num_input_tokens_seen": 5842464, "step": 30650 }, { "epoch": 15.932952182952183, "grad_norm": 0.8330869674682617, "learning_rate": 6.039722264247944e-06, "loss": 0.0701, "num_input_tokens_seen": 5843360, "step": 30655 }, { "epoch": 15.935550935550935, "grad_norm": 0.1957528442144394, "learning_rate": 6.032333592594322e-06, "loss": 0.0729, "num_input_tokens_seen": 5844352, "step": 30660 }, { "epoch": 15.938149688149688, "grad_norm": 0.5540515780448914, "learning_rate": 6.0249488230416735e-06, "loss": 0.1959, "num_input_tokens_seen": 5845344, "step": 30665 }, { "epoch": 15.940748440748441, "grad_norm": 0.5079354643821716, "learning_rate": 6.017567957109235e-06, "loss": 0.1501, "num_input_tokens_seen": 5846272, "step": 30670 }, { "epoch": 15.943347193347194, "grad_norm": 0.17341573536396027, "learning_rate": 6.010190996315418e-06, "loss": 0.1118, "num_input_tokens_seen": 5847232, "step": 30675 }, { "epoch": 15.945945945945946, "grad_norm": 0.15771649777889252, "learning_rate": 6.002817942177838e-06, "loss": 0.1234, "num_input_tokens_seen": 5848160, "step": 30680 }, { "epoch": 15.948544698544698, "grad_norm": 0.24013102054595947, "learning_rate": 5.99544879621331e-06, "loss": 0.0755, "num_input_tokens_seen": 5849120, "step": 30685 }, { "epoch": 15.951143451143452, "grad_norm": 1.108269214630127, "learning_rate": 5.988083559937843e-06, "loss": 0.1612, "num_input_tokens_seen": 5850016, "step": 30690 }, { "epoch": 15.953742203742204, "grad_norm": 0.5605195760726929, "learning_rate": 5.980722234866643e-06, "loss": 0.1529, "num_input_tokens_seen": 5850944, "step": 30695 }, { "epoch": 15.956340956340956, "grad_norm": 0.09202852845191956, "learning_rate": 5.973364822514099e-06, "loss": 0.1853, "num_input_tokens_seen": 5851904, "step": 30700 }, { "epoch": 15.95893970893971, "grad_norm": 0.47779905796051025, "learning_rate": 5.966011324393822e-06, "loss": 0.1396, "num_input_tokens_seen": 5852864, "step": 30705 }, { "epoch": 15.961538461538462, "grad_norm": 0.13828113675117493, "learning_rate": 5.9586617420185955e-06, "loss": 0.112, "num_input_tokens_seen": 5853856, "step": 30710 }, { "epoch": 15.964137214137214, "grad_norm": 0.3153620660305023, "learning_rate": 5.951316076900404e-06, "loss": 0.1188, "num_input_tokens_seen": 5854848, "step": 30715 }, { "epoch": 15.966735966735968, "grad_norm": 1.3743170499801636, "learning_rate": 5.943974330550417e-06, "loss": 0.2315, "num_input_tokens_seen": 5855808, "step": 30720 }, { "epoch": 15.96933471933472, "grad_norm": 0.4244115948677063, "learning_rate": 5.93663650447902e-06, "loss": 0.1857, "num_input_tokens_seen": 5856736, "step": 30725 }, { "epoch": 15.971933471933472, "grad_norm": 0.1908334344625473, "learning_rate": 5.929302600195777e-06, "loss": 0.0664, "num_input_tokens_seen": 5857664, "step": 30730 }, { "epoch": 15.974532224532224, "grad_norm": 1.214031457901001, "learning_rate": 5.9219726192094426e-06, "loss": 0.1826, "num_input_tokens_seen": 5858656, "step": 30735 }, { "epoch": 15.977130977130978, "grad_norm": 0.9784892201423645, "learning_rate": 5.914646563027962e-06, "loss": 0.2507, "num_input_tokens_seen": 5859616, "step": 30740 }, { "epoch": 15.97972972972973, "grad_norm": 0.38290461897850037, "learning_rate": 5.907324433158493e-06, "loss": 0.0783, "num_input_tokens_seen": 5860544, "step": 30745 }, { "epoch": 15.982328482328482, "grad_norm": 0.14002206921577454, "learning_rate": 5.900006231107372e-06, "loss": 0.12, "num_input_tokens_seen": 5861504, "step": 30750 }, { "epoch": 15.984927234927234, "grad_norm": 0.40151283144950867, "learning_rate": 5.892691958380109e-06, "loss": 0.1484, "num_input_tokens_seen": 5862432, "step": 30755 }, { "epoch": 15.987525987525988, "grad_norm": 0.9179131984710693, "learning_rate": 5.885381616481445e-06, "loss": 0.0943, "num_input_tokens_seen": 5863392, "step": 30760 }, { "epoch": 15.99012474012474, "grad_norm": 0.33520495891571045, "learning_rate": 5.878075206915279e-06, "loss": 0.1213, "num_input_tokens_seen": 5864320, "step": 30765 }, { "epoch": 15.992723492723492, "grad_norm": 0.95445317029953, "learning_rate": 5.870772731184715e-06, "loss": 0.1294, "num_input_tokens_seen": 5865312, "step": 30770 }, { "epoch": 15.995322245322246, "grad_norm": 0.8859032988548279, "learning_rate": 5.863474190792053e-06, "loss": 0.1907, "num_input_tokens_seen": 5866240, "step": 30775 }, { "epoch": 15.997920997920998, "grad_norm": 0.4459361135959625, "learning_rate": 5.856179587238775e-06, "loss": 0.1746, "num_input_tokens_seen": 5867264, "step": 30780 }, { "epoch": 16.0, "eval_loss": 0.1468277871608734, "eval_runtime": 8.0254, "eval_samples_per_second": 106.661, "eval_steps_per_second": 26.665, "num_input_tokens_seen": 5867920, "step": 30784 }, { "epoch": 16.00051975051975, "grad_norm": 0.5517723560333252, "learning_rate": 5.848888922025553e-06, "loss": 0.1083, "num_input_tokens_seen": 5868112, "step": 30785 }, { "epoch": 16.003118503118504, "grad_norm": 0.24829667806625366, "learning_rate": 5.841602196652241e-06, "loss": 0.0642, "num_input_tokens_seen": 5869136, "step": 30790 }, { "epoch": 16.005717255717254, "grad_norm": 0.30624309182167053, "learning_rate": 5.834319412617911e-06, "loss": 0.1117, "num_input_tokens_seen": 5870032, "step": 30795 }, { "epoch": 16.008316008316008, "grad_norm": 1.5394797325134277, "learning_rate": 5.827040571420792e-06, "loss": 0.0992, "num_input_tokens_seen": 5870928, "step": 30800 }, { "epoch": 16.010914760914762, "grad_norm": 0.4734882712364197, "learning_rate": 5.819765674558322e-06, "loss": 0.0675, "num_input_tokens_seen": 5871920, "step": 30805 }, { "epoch": 16.013513513513512, "grad_norm": 0.29270756244659424, "learning_rate": 5.812494723527106e-06, "loss": 0.1169, "num_input_tokens_seen": 5872880, "step": 30810 }, { "epoch": 16.016112266112266, "grad_norm": 0.8079338669776917, "learning_rate": 5.805227719822981e-06, "loss": 0.1365, "num_input_tokens_seen": 5873840, "step": 30815 }, { "epoch": 16.01871101871102, "grad_norm": 0.971403956413269, "learning_rate": 5.797964664940916e-06, "loss": 0.1738, "num_input_tokens_seen": 5874896, "step": 30820 }, { "epoch": 16.02130977130977, "grad_norm": 1.177677035331726, "learning_rate": 5.7907055603750985e-06, "loss": 0.1158, "num_input_tokens_seen": 5875856, "step": 30825 }, { "epoch": 16.023908523908524, "grad_norm": 1.4588929414749146, "learning_rate": 5.783450407618907e-06, "loss": 0.1672, "num_input_tokens_seen": 5876784, "step": 30830 }, { "epoch": 16.026507276507278, "grad_norm": 1.1085705757141113, "learning_rate": 5.776199208164901e-06, "loss": 0.0953, "num_input_tokens_seen": 5877712, "step": 30835 }, { "epoch": 16.02910602910603, "grad_norm": 1.4081406593322754, "learning_rate": 5.768951963504818e-06, "loss": 0.1479, "num_input_tokens_seen": 5878640, "step": 30840 }, { "epoch": 16.031704781704782, "grad_norm": 0.2471456676721573, "learning_rate": 5.761708675129585e-06, "loss": 0.1151, "num_input_tokens_seen": 5879600, "step": 30845 }, { "epoch": 16.034303534303536, "grad_norm": 0.7136829495429993, "learning_rate": 5.7544693445293315e-06, "loss": 0.1228, "num_input_tokens_seen": 5880592, "step": 30850 }, { "epoch": 16.036902286902286, "grad_norm": 0.7808423042297363, "learning_rate": 5.747233973193353e-06, "loss": 0.1197, "num_input_tokens_seen": 5881488, "step": 30855 }, { "epoch": 16.03950103950104, "grad_norm": 0.35052868723869324, "learning_rate": 5.740002562610141e-06, "loss": 0.1011, "num_input_tokens_seen": 5882416, "step": 30860 }, { "epoch": 16.04209979209979, "grad_norm": 0.4505654275417328, "learning_rate": 5.7327751142673606e-06, "loss": 0.1755, "num_input_tokens_seen": 5883312, "step": 30865 }, { "epoch": 16.044698544698544, "grad_norm": 0.9515411257743835, "learning_rate": 5.725551629651882e-06, "loss": 0.1277, "num_input_tokens_seen": 5884208, "step": 30870 }, { "epoch": 16.0472972972973, "grad_norm": 0.19794853031635284, "learning_rate": 5.718332110249744e-06, "loss": 0.0563, "num_input_tokens_seen": 5885040, "step": 30875 }, { "epoch": 16.04989604989605, "grad_norm": 0.339881032705307, "learning_rate": 5.711116557546173e-06, "loss": 0.142, "num_input_tokens_seen": 5886000, "step": 30880 }, { "epoch": 16.052494802494802, "grad_norm": 0.9601635932922363, "learning_rate": 5.7039049730255786e-06, "loss": 0.101, "num_input_tokens_seen": 5886928, "step": 30885 }, { "epoch": 16.055093555093556, "grad_norm": 1.0324180126190186, "learning_rate": 5.696697358171557e-06, "loss": 0.048, "num_input_tokens_seen": 5887760, "step": 30890 }, { "epoch": 16.057692307692307, "grad_norm": 0.7525599598884583, "learning_rate": 5.689493714466879e-06, "loss": 0.0709, "num_input_tokens_seen": 5888784, "step": 30895 }, { "epoch": 16.06029106029106, "grad_norm": 0.6891484260559082, "learning_rate": 5.682294043393518e-06, "loss": 0.0612, "num_input_tokens_seen": 5889680, "step": 30900 }, { "epoch": 16.062889812889814, "grad_norm": 0.7248923778533936, "learning_rate": 5.675098346432614e-06, "loss": 0.0889, "num_input_tokens_seen": 5890640, "step": 30905 }, { "epoch": 16.065488565488565, "grad_norm": 1.2923500537872314, "learning_rate": 5.6679066250644924e-06, "loss": 0.0685, "num_input_tokens_seen": 5891568, "step": 30910 }, { "epoch": 16.06808731808732, "grad_norm": 0.5400479435920715, "learning_rate": 5.660718880768653e-06, "loss": 0.0611, "num_input_tokens_seen": 5892528, "step": 30915 }, { "epoch": 16.070686070686072, "grad_norm": 0.9423142075538635, "learning_rate": 5.653535115023803e-06, "loss": 0.089, "num_input_tokens_seen": 5893424, "step": 30920 }, { "epoch": 16.073284823284823, "grad_norm": 1.694387435913086, "learning_rate": 5.646355329307807e-06, "loss": 0.1713, "num_input_tokens_seen": 5894352, "step": 30925 }, { "epoch": 16.075883575883577, "grad_norm": 0.6631230115890503, "learning_rate": 5.639179525097712e-06, "loss": 0.1842, "num_input_tokens_seen": 5895248, "step": 30930 }, { "epoch": 16.078482328482327, "grad_norm": 0.4109783470630646, "learning_rate": 5.632007703869751e-06, "loss": 0.0951, "num_input_tokens_seen": 5896208, "step": 30935 }, { "epoch": 16.08108108108108, "grad_norm": 0.19563819468021393, "learning_rate": 5.624839867099352e-06, "loss": 0.0652, "num_input_tokens_seen": 5897104, "step": 30940 }, { "epoch": 16.083679833679835, "grad_norm": 0.13711456954479218, "learning_rate": 5.617676016261109e-06, "loss": 0.0797, "num_input_tokens_seen": 5898064, "step": 30945 }, { "epoch": 16.086278586278585, "grad_norm": 0.43344730138778687, "learning_rate": 5.610516152828776e-06, "loss": 0.09, "num_input_tokens_seen": 5899056, "step": 30950 }, { "epoch": 16.08887733887734, "grad_norm": 0.5927060842514038, "learning_rate": 5.603360278275327e-06, "loss": 0.0993, "num_input_tokens_seen": 5899984, "step": 30955 }, { "epoch": 16.091476091476093, "grad_norm": 0.5305303931236267, "learning_rate": 5.596208394072891e-06, "loss": 0.1109, "num_input_tokens_seen": 5900944, "step": 30960 }, { "epoch": 16.094074844074843, "grad_norm": 0.7501252889633179, "learning_rate": 5.589060501692777e-06, "loss": 0.2016, "num_input_tokens_seen": 5901840, "step": 30965 }, { "epoch": 16.096673596673597, "grad_norm": 0.1113305315375328, "learning_rate": 5.5819166026054775e-06, "loss": 0.106, "num_input_tokens_seen": 5902864, "step": 30970 }, { "epoch": 16.09927234927235, "grad_norm": 0.23762662708759308, "learning_rate": 5.574776698280671e-06, "loss": 0.0705, "num_input_tokens_seen": 5903824, "step": 30975 }, { "epoch": 16.1018711018711, "grad_norm": 0.8007466197013855, "learning_rate": 5.567640790187198e-06, "loss": 0.1386, "num_input_tokens_seen": 5904848, "step": 30980 }, { "epoch": 16.104469854469855, "grad_norm": 0.17652252316474915, "learning_rate": 5.560508879793089e-06, "loss": 0.1007, "num_input_tokens_seen": 5905808, "step": 30985 }, { "epoch": 16.10706860706861, "grad_norm": 1.4760029315948486, "learning_rate": 5.553380968565539e-06, "loss": 0.1482, "num_input_tokens_seen": 5906704, "step": 30990 }, { "epoch": 16.10966735966736, "grad_norm": 1.1474292278289795, "learning_rate": 5.546257057970941e-06, "loss": 0.164, "num_input_tokens_seen": 5907664, "step": 30995 }, { "epoch": 16.112266112266113, "grad_norm": 1.2512927055358887, "learning_rate": 5.539137149474849e-06, "loss": 0.167, "num_input_tokens_seen": 5908592, "step": 31000 }, { "epoch": 16.114864864864863, "grad_norm": 1.203719139099121, "learning_rate": 5.53202124454199e-06, "loss": 0.1684, "num_input_tokens_seen": 5909520, "step": 31005 }, { "epoch": 16.117463617463617, "grad_norm": 0.5929862856864929, "learning_rate": 5.524909344636297e-06, "loss": 0.0459, "num_input_tokens_seen": 5910512, "step": 31010 }, { "epoch": 16.12006237006237, "grad_norm": 1.2280699014663696, "learning_rate": 5.517801451220836e-06, "loss": 0.0977, "num_input_tokens_seen": 5911504, "step": 31015 }, { "epoch": 16.12266112266112, "grad_norm": 0.9851067066192627, "learning_rate": 5.510697565757869e-06, "loss": 0.2153, "num_input_tokens_seen": 5912496, "step": 31020 }, { "epoch": 16.125259875259875, "grad_norm": 0.8417218327522278, "learning_rate": 5.50359768970885e-06, "loss": 0.1256, "num_input_tokens_seen": 5913488, "step": 31025 }, { "epoch": 16.12785862785863, "grad_norm": 0.11111194640398026, "learning_rate": 5.496501824534386e-06, "loss": 0.1299, "num_input_tokens_seen": 5914448, "step": 31030 }, { "epoch": 16.13045738045738, "grad_norm": 0.6546400189399719, "learning_rate": 5.489409971694262e-06, "loss": 0.1028, "num_input_tokens_seen": 5915440, "step": 31035 }, { "epoch": 16.133056133056133, "grad_norm": 0.5307150483131409, "learning_rate": 5.482322132647438e-06, "loss": 0.0944, "num_input_tokens_seen": 5916368, "step": 31040 }, { "epoch": 16.135654885654887, "grad_norm": 1.3377667665481567, "learning_rate": 5.475238308852063e-06, "loss": 0.141, "num_input_tokens_seen": 5917328, "step": 31045 }, { "epoch": 16.138253638253637, "grad_norm": 0.30135658383369446, "learning_rate": 5.468158501765444e-06, "loss": 0.1064, "num_input_tokens_seen": 5918320, "step": 31050 }, { "epoch": 16.14085239085239, "grad_norm": 0.19060060381889343, "learning_rate": 5.4610827128440625e-06, "loss": 0.0742, "num_input_tokens_seen": 5919280, "step": 31055 }, { "epoch": 16.143451143451145, "grad_norm": 1.1772727966308594, "learning_rate": 5.454010943543572e-06, "loss": 0.0906, "num_input_tokens_seen": 5920176, "step": 31060 }, { "epoch": 16.146049896049895, "grad_norm": 0.20821326971054077, "learning_rate": 5.4469431953188185e-06, "loss": 0.118, "num_input_tokens_seen": 5921072, "step": 31065 }, { "epoch": 16.14864864864865, "grad_norm": 0.7118170261383057, "learning_rate": 5.4398794696237984e-06, "loss": 0.089, "num_input_tokens_seen": 5922064, "step": 31070 }, { "epoch": 16.151247401247403, "grad_norm": 0.16629809141159058, "learning_rate": 5.43281976791169e-06, "loss": 0.0869, "num_input_tokens_seen": 5923024, "step": 31075 }, { "epoch": 16.153846153846153, "grad_norm": 1.0489940643310547, "learning_rate": 5.4257640916348386e-06, "loss": 0.0635, "num_input_tokens_seen": 5923952, "step": 31080 }, { "epoch": 16.156444906444907, "grad_norm": 0.5816037058830261, "learning_rate": 5.418712442244767e-06, "loss": 0.0948, "num_input_tokens_seen": 5924944, "step": 31085 }, { "epoch": 16.159043659043657, "grad_norm": 2.860443115234375, "learning_rate": 5.411664821192169e-06, "loss": 0.2837, "num_input_tokens_seen": 5925872, "step": 31090 }, { "epoch": 16.16164241164241, "grad_norm": 1.0406538248062134, "learning_rate": 5.4046212299268985e-06, "loss": 0.1327, "num_input_tokens_seen": 5926832, "step": 31095 }, { "epoch": 16.164241164241165, "grad_norm": 0.9711918830871582, "learning_rate": 5.397581669898005e-06, "loss": 0.1537, "num_input_tokens_seen": 5927792, "step": 31100 }, { "epoch": 16.166839916839916, "grad_norm": 0.4757426381111145, "learning_rate": 5.390546142553687e-06, "loss": 0.1607, "num_input_tokens_seen": 5928752, "step": 31105 }, { "epoch": 16.16943866943867, "grad_norm": 0.3375747501850128, "learning_rate": 5.38351464934132e-06, "loss": 0.0445, "num_input_tokens_seen": 5929616, "step": 31110 }, { "epoch": 16.172037422037423, "grad_norm": 0.9690465331077576, "learning_rate": 5.3764871917074435e-06, "loss": 0.1523, "num_input_tokens_seen": 5930640, "step": 31115 }, { "epoch": 16.174636174636174, "grad_norm": 0.8471152186393738, "learning_rate": 5.369463771097788e-06, "loss": 0.1788, "num_input_tokens_seen": 5931632, "step": 31120 }, { "epoch": 16.177234927234927, "grad_norm": 0.40006664395332336, "learning_rate": 5.362444388957227e-06, "loss": 0.075, "num_input_tokens_seen": 5932592, "step": 31125 }, { "epoch": 16.17983367983368, "grad_norm": 0.5184862017631531, "learning_rate": 5.355429046729812e-06, "loss": 0.1173, "num_input_tokens_seen": 5933552, "step": 31130 }, { "epoch": 16.18243243243243, "grad_norm": 0.7495272159576416, "learning_rate": 5.34841774585878e-06, "loss": 0.1838, "num_input_tokens_seen": 5934512, "step": 31135 }, { "epoch": 16.185031185031185, "grad_norm": 0.712189257144928, "learning_rate": 5.341410487786519e-06, "loss": 0.0616, "num_input_tokens_seen": 5935440, "step": 31140 }, { "epoch": 16.18762993762994, "grad_norm": 0.3079983592033386, "learning_rate": 5.334407273954573e-06, "loss": 0.1106, "num_input_tokens_seen": 5936368, "step": 31145 }, { "epoch": 16.19022869022869, "grad_norm": 0.2807735204696655, "learning_rate": 5.327408105803685e-06, "loss": 0.1325, "num_input_tokens_seen": 5937360, "step": 31150 }, { "epoch": 16.192827442827443, "grad_norm": 0.082057423889637, "learning_rate": 5.320412984773748e-06, "loss": 0.1677, "num_input_tokens_seen": 5938320, "step": 31155 }, { "epoch": 16.195426195426194, "grad_norm": 1.4521963596343994, "learning_rate": 5.313421912303826e-06, "loss": 0.0807, "num_input_tokens_seen": 5939344, "step": 31160 }, { "epoch": 16.198024948024948, "grad_norm": 0.8759795427322388, "learning_rate": 5.306434889832137e-06, "loss": 0.1288, "num_input_tokens_seen": 5940272, "step": 31165 }, { "epoch": 16.2006237006237, "grad_norm": 0.1516459435224533, "learning_rate": 5.299451918796098e-06, "loss": 0.1414, "num_input_tokens_seen": 5941232, "step": 31170 }, { "epoch": 16.203222453222452, "grad_norm": 0.8083745241165161, "learning_rate": 5.292473000632262e-06, "loss": 0.1416, "num_input_tokens_seen": 5942160, "step": 31175 }, { "epoch": 16.205821205821206, "grad_norm": 0.7749733924865723, "learning_rate": 5.285498136776357e-06, "loss": 0.1054, "num_input_tokens_seen": 5943120, "step": 31180 }, { "epoch": 16.20841995841996, "grad_norm": 1.2338570356369019, "learning_rate": 5.278527328663277e-06, "loss": 0.2564, "num_input_tokens_seen": 5944176, "step": 31185 }, { "epoch": 16.21101871101871, "grad_norm": 0.34450817108154297, "learning_rate": 5.271560577727094e-06, "loss": 0.0862, "num_input_tokens_seen": 5945072, "step": 31190 }, { "epoch": 16.213617463617464, "grad_norm": 0.6724627614021301, "learning_rate": 5.264597885401029e-06, "loss": 0.0753, "num_input_tokens_seen": 5946160, "step": 31195 }, { "epoch": 16.216216216216218, "grad_norm": 0.09972194582223892, "learning_rate": 5.257639253117472e-06, "loss": 0.1401, "num_input_tokens_seen": 5947056, "step": 31200 }, { "epoch": 16.218814968814968, "grad_norm": 0.3353639543056488, "learning_rate": 5.25068468230798e-06, "loss": 0.1292, "num_input_tokens_seen": 5948048, "step": 31205 }, { "epoch": 16.22141372141372, "grad_norm": 0.15564243495464325, "learning_rate": 5.243734174403275e-06, "loss": 0.093, "num_input_tokens_seen": 5949040, "step": 31210 }, { "epoch": 16.224012474012476, "grad_norm": 0.7027372717857361, "learning_rate": 5.236787730833242e-06, "loss": 0.2775, "num_input_tokens_seen": 5950032, "step": 31215 }, { "epoch": 16.226611226611226, "grad_norm": 1.257400631904602, "learning_rate": 5.229845353026921e-06, "loss": 0.137, "num_input_tokens_seen": 5951024, "step": 31220 }, { "epoch": 16.22920997920998, "grad_norm": 0.043212492018938065, "learning_rate": 5.222907042412542e-06, "loss": 0.0503, "num_input_tokens_seen": 5951984, "step": 31225 }, { "epoch": 16.23180873180873, "grad_norm": 0.3665895164012909, "learning_rate": 5.2159728004174705e-06, "loss": 0.1135, "num_input_tokens_seen": 5952976, "step": 31230 }, { "epoch": 16.234407484407484, "grad_norm": 0.3806682527065277, "learning_rate": 5.209042628468247e-06, "loss": 0.092, "num_input_tokens_seen": 5953904, "step": 31235 }, { "epoch": 16.237006237006238, "grad_norm": 1.2059319019317627, "learning_rate": 5.202116527990566e-06, "loss": 0.1144, "num_input_tokens_seen": 5954768, "step": 31240 }, { "epoch": 16.239604989604988, "grad_norm": 1.6249077320098877, "learning_rate": 5.1951945004093064e-06, "loss": 0.2889, "num_input_tokens_seen": 5955728, "step": 31245 }, { "epoch": 16.242203742203742, "grad_norm": 1.0192972421646118, "learning_rate": 5.188276547148485e-06, "loss": 0.0807, "num_input_tokens_seen": 5956720, "step": 31250 }, { "epoch": 16.244802494802496, "grad_norm": 0.9658232927322388, "learning_rate": 5.181362669631284e-06, "loss": 0.1202, "num_input_tokens_seen": 5957680, "step": 31255 }, { "epoch": 16.247401247401246, "grad_norm": 0.1276198774576187, "learning_rate": 5.1744528692800666e-06, "loss": 0.082, "num_input_tokens_seen": 5958608, "step": 31260 }, { "epoch": 16.25, "grad_norm": 1.067440390586853, "learning_rate": 5.167547147516335e-06, "loss": 0.243, "num_input_tokens_seen": 5959536, "step": 31265 }, { "epoch": 16.252598752598754, "grad_norm": 0.14302895963191986, "learning_rate": 5.160645505760761e-06, "loss": 0.1524, "num_input_tokens_seen": 5960528, "step": 31270 }, { "epoch": 16.255197505197504, "grad_norm": 0.43741241097450256, "learning_rate": 5.1537479454331795e-06, "loss": 0.1825, "num_input_tokens_seen": 5961520, "step": 31275 }, { "epoch": 16.257796257796258, "grad_norm": 0.19205980002880096, "learning_rate": 5.146854467952581e-06, "loss": 0.0922, "num_input_tokens_seen": 5962544, "step": 31280 }, { "epoch": 16.260395010395012, "grad_norm": 0.6987667679786682, "learning_rate": 5.139965074737119e-06, "loss": 0.1056, "num_input_tokens_seen": 5963440, "step": 31285 }, { "epoch": 16.262993762993762, "grad_norm": 0.9062141180038452, "learning_rate": 5.133079767204097e-06, "loss": 0.1371, "num_input_tokens_seen": 5964368, "step": 31290 }, { "epoch": 16.265592515592516, "grad_norm": 0.5370936393737793, "learning_rate": 5.126198546770003e-06, "loss": 0.0887, "num_input_tokens_seen": 5965328, "step": 31295 }, { "epoch": 16.26819126819127, "grad_norm": 0.5154812932014465, "learning_rate": 5.119321414850459e-06, "loss": 0.1259, "num_input_tokens_seen": 5966224, "step": 31300 }, { "epoch": 16.27079002079002, "grad_norm": 0.8889248371124268, "learning_rate": 5.112448372860257e-06, "loss": 0.0524, "num_input_tokens_seen": 5967184, "step": 31305 }, { "epoch": 16.273388773388774, "grad_norm": 0.12794579565525055, "learning_rate": 5.105579422213335e-06, "loss": 0.0953, "num_input_tokens_seen": 5968144, "step": 31310 }, { "epoch": 16.275987525987524, "grad_norm": 0.8763397336006165, "learning_rate": 5.098714564322818e-06, "loss": 0.0511, "num_input_tokens_seen": 5969136, "step": 31315 }, { "epoch": 16.27858627858628, "grad_norm": 1.220270037651062, "learning_rate": 5.091853800600963e-06, "loss": 0.0997, "num_input_tokens_seen": 5970064, "step": 31320 }, { "epoch": 16.281185031185032, "grad_norm": 0.16453464329242706, "learning_rate": 5.084997132459191e-06, "loss": 0.1263, "num_input_tokens_seen": 5970992, "step": 31325 }, { "epoch": 16.283783783783782, "grad_norm": 1.3109636306762695, "learning_rate": 5.078144561308077e-06, "loss": 0.2487, "num_input_tokens_seen": 5972016, "step": 31330 }, { "epoch": 16.286382536382536, "grad_norm": 0.38725191354751587, "learning_rate": 5.071296088557376e-06, "loss": 0.0583, "num_input_tokens_seen": 5972912, "step": 31335 }, { "epoch": 16.28898128898129, "grad_norm": 0.16109849512577057, "learning_rate": 5.064451715615961e-06, "loss": 0.1284, "num_input_tokens_seen": 5973872, "step": 31340 }, { "epoch": 16.29158004158004, "grad_norm": 0.23128721117973328, "learning_rate": 5.057611443891888e-06, "loss": 0.03, "num_input_tokens_seen": 5974768, "step": 31345 }, { "epoch": 16.294178794178794, "grad_norm": 0.46596553921699524, "learning_rate": 5.050775274792375e-06, "loss": 0.1013, "num_input_tokens_seen": 5975728, "step": 31350 }, { "epoch": 16.296777546777548, "grad_norm": 0.6006269454956055, "learning_rate": 5.043943209723776e-06, "loss": 0.1691, "num_input_tokens_seen": 5976688, "step": 31355 }, { "epoch": 16.2993762993763, "grad_norm": 0.3224141001701355, "learning_rate": 5.037115250091612e-06, "loss": 0.1165, "num_input_tokens_seen": 5977680, "step": 31360 }, { "epoch": 16.301975051975052, "grad_norm": 0.35483625531196594, "learning_rate": 5.030291397300546e-06, "loss": 0.1203, "num_input_tokens_seen": 5978608, "step": 31365 }, { "epoch": 16.304573804573806, "grad_norm": 1.3242992162704468, "learning_rate": 5.023471652754427e-06, "loss": 0.1996, "num_input_tokens_seen": 5979504, "step": 31370 }, { "epoch": 16.307172557172557, "grad_norm": 0.7371925711631775, "learning_rate": 5.016656017856228e-06, "loss": 0.1077, "num_input_tokens_seen": 5980464, "step": 31375 }, { "epoch": 16.30977130977131, "grad_norm": 0.9028890132904053, "learning_rate": 5.009844494008084e-06, "loss": 0.104, "num_input_tokens_seen": 5981456, "step": 31380 }, { "epoch": 16.31237006237006, "grad_norm": 0.6333000659942627, "learning_rate": 5.003037082611295e-06, "loss": 0.1912, "num_input_tokens_seen": 5982384, "step": 31385 }, { "epoch": 16.314968814968815, "grad_norm": 0.5820915699005127, "learning_rate": 4.99623378506631e-06, "loss": 0.0861, "num_input_tokens_seen": 5983376, "step": 31390 }, { "epoch": 16.31756756756757, "grad_norm": 0.4500606060028076, "learning_rate": 4.989434602772722e-06, "loss": 0.0807, "num_input_tokens_seen": 5984336, "step": 31395 }, { "epoch": 16.32016632016632, "grad_norm": 0.1507081687450409, "learning_rate": 4.982639537129285e-06, "loss": 0.073, "num_input_tokens_seen": 5985232, "step": 31400 }, { "epoch": 16.322765072765073, "grad_norm": 1.4511510133743286, "learning_rate": 4.975848589533913e-06, "loss": 0.1326, "num_input_tokens_seen": 5986160, "step": 31405 }, { "epoch": 16.325363825363826, "grad_norm": 0.11422190070152283, "learning_rate": 4.969061761383659e-06, "loss": 0.0695, "num_input_tokens_seen": 5987120, "step": 31410 }, { "epoch": 16.327962577962577, "grad_norm": 0.1785142719745636, "learning_rate": 4.962279054074728e-06, "loss": 0.0767, "num_input_tokens_seen": 5988016, "step": 31415 }, { "epoch": 16.33056133056133, "grad_norm": 0.18237672746181488, "learning_rate": 4.955500469002506e-06, "loss": 0.0661, "num_input_tokens_seen": 5988976, "step": 31420 }, { "epoch": 16.333160083160084, "grad_norm": 1.268658995628357, "learning_rate": 4.948726007561494e-06, "loss": 0.2745, "num_input_tokens_seen": 5989936, "step": 31425 }, { "epoch": 16.335758835758835, "grad_norm": 0.27953335642814636, "learning_rate": 4.941955671145363e-06, "loss": 0.043, "num_input_tokens_seen": 5990864, "step": 31430 }, { "epoch": 16.33835758835759, "grad_norm": 1.2199792861938477, "learning_rate": 4.935189461146928e-06, "loss": 0.1623, "num_input_tokens_seen": 5991792, "step": 31435 }, { "epoch": 16.340956340956343, "grad_norm": 0.9518185257911682, "learning_rate": 4.92842737895817e-06, "loss": 0.0809, "num_input_tokens_seen": 5992656, "step": 31440 }, { "epoch": 16.343555093555093, "grad_norm": 0.6712750792503357, "learning_rate": 4.9216694259702044e-06, "loss": 0.1109, "num_input_tokens_seen": 5993616, "step": 31445 }, { "epoch": 16.346153846153847, "grad_norm": 1.4388575553894043, "learning_rate": 4.914915603573306e-06, "loss": 0.2286, "num_input_tokens_seen": 5994608, "step": 31450 }, { "epoch": 16.348752598752597, "grad_norm": 0.48755472898483276, "learning_rate": 4.908165913156887e-06, "loss": 0.1242, "num_input_tokens_seen": 5995536, "step": 31455 }, { "epoch": 16.35135135135135, "grad_norm": 1.2737430334091187, "learning_rate": 4.901420356109535e-06, "loss": 0.1089, "num_input_tokens_seen": 5996464, "step": 31460 }, { "epoch": 16.353950103950105, "grad_norm": 1.3541210889816284, "learning_rate": 4.8946789338189695e-06, "loss": 0.0812, "num_input_tokens_seen": 5997424, "step": 31465 }, { "epoch": 16.356548856548855, "grad_norm": 1.1957848072052002, "learning_rate": 4.8879416476720455e-06, "loss": 0.0701, "num_input_tokens_seen": 5998352, "step": 31470 }, { "epoch": 16.35914760914761, "grad_norm": 0.16972671449184418, "learning_rate": 4.881208499054798e-06, "loss": 0.1637, "num_input_tokens_seen": 5999312, "step": 31475 }, { "epoch": 16.361746361746363, "grad_norm": 0.18792720139026642, "learning_rate": 4.874479489352396e-06, "loss": 0.065, "num_input_tokens_seen": 6000368, "step": 31480 }, { "epoch": 16.364345114345113, "grad_norm": 1.2369117736816406, "learning_rate": 4.8677546199491496e-06, "loss": 0.1433, "num_input_tokens_seen": 6001328, "step": 31485 }, { "epoch": 16.366943866943867, "grad_norm": 0.303234338760376, "learning_rate": 4.8610338922285255e-06, "loss": 0.06, "num_input_tokens_seen": 6002320, "step": 31490 }, { "epoch": 16.36954261954262, "grad_norm": 0.4898061156272888, "learning_rate": 4.8543173075731445e-06, "loss": 0.1032, "num_input_tokens_seen": 6003216, "step": 31495 }, { "epoch": 16.37214137214137, "grad_norm": 1.1331948041915894, "learning_rate": 4.847604867364766e-06, "loss": 0.1837, "num_input_tokens_seen": 6004176, "step": 31500 }, { "epoch": 16.374740124740125, "grad_norm": 0.871682345867157, "learning_rate": 4.840896572984291e-06, "loss": 0.1169, "num_input_tokens_seen": 6005104, "step": 31505 }, { "epoch": 16.37733887733888, "grad_norm": 1.5195125341415405, "learning_rate": 4.83419242581179e-06, "loss": 0.1197, "num_input_tokens_seen": 6006032, "step": 31510 }, { "epoch": 16.37993762993763, "grad_norm": 0.9646396636962891, "learning_rate": 4.827492427226454e-06, "loss": 0.0823, "num_input_tokens_seen": 6006992, "step": 31515 }, { "epoch": 16.382536382536383, "grad_norm": 0.3694233000278473, "learning_rate": 4.820796578606637e-06, "loss": 0.1667, "num_input_tokens_seen": 6007984, "step": 31520 }, { "epoch": 16.385135135135137, "grad_norm": 0.7403656244277954, "learning_rate": 4.814104881329828e-06, "loss": 0.1222, "num_input_tokens_seen": 6008912, "step": 31525 }, { "epoch": 16.387733887733887, "grad_norm": 0.21478745341300964, "learning_rate": 4.807417336772687e-06, "loss": 0.0717, "num_input_tokens_seen": 6009840, "step": 31530 }, { "epoch": 16.39033264033264, "grad_norm": 0.06244705617427826, "learning_rate": 4.800733946310981e-06, "loss": 0.158, "num_input_tokens_seen": 6010800, "step": 31535 }, { "epoch": 16.39293139293139, "grad_norm": 1.0976020097732544, "learning_rate": 4.794054711319646e-06, "loss": 0.1936, "num_input_tokens_seen": 6011792, "step": 31540 }, { "epoch": 16.395530145530145, "grad_norm": 0.8712752461433411, "learning_rate": 4.787379633172767e-06, "loss": 0.1654, "num_input_tokens_seen": 6012688, "step": 31545 }, { "epoch": 16.3981288981289, "grad_norm": 0.28226831555366516, "learning_rate": 4.780708713243565e-06, "loss": 0.1103, "num_input_tokens_seen": 6013680, "step": 31550 }, { "epoch": 16.40072765072765, "grad_norm": 0.30769971013069153, "learning_rate": 4.774041952904407e-06, "loss": 0.1493, "num_input_tokens_seen": 6014672, "step": 31555 }, { "epoch": 16.403326403326403, "grad_norm": 0.09998078644275665, "learning_rate": 4.767379353526796e-06, "loss": 0.0533, "num_input_tokens_seen": 6015632, "step": 31560 }, { "epoch": 16.405925155925157, "grad_norm": 0.3700677454471588, "learning_rate": 4.760720916481401e-06, "loss": 0.0854, "num_input_tokens_seen": 6016592, "step": 31565 }, { "epoch": 16.408523908523907, "grad_norm": 0.23156452178955078, "learning_rate": 4.7540666431380166e-06, "loss": 0.0775, "num_input_tokens_seen": 6017520, "step": 31570 }, { "epoch": 16.41112266112266, "grad_norm": 0.35607919096946716, "learning_rate": 4.747416534865581e-06, "loss": 0.0846, "num_input_tokens_seen": 6018416, "step": 31575 }, { "epoch": 16.413721413721415, "grad_norm": 0.33552002906799316, "learning_rate": 4.740770593032176e-06, "loss": 0.1156, "num_input_tokens_seen": 6019376, "step": 31580 }, { "epoch": 16.416320166320165, "grad_norm": 0.5460625290870667, "learning_rate": 4.7341288190050455e-06, "loss": 0.1764, "num_input_tokens_seen": 6020368, "step": 31585 }, { "epoch": 16.41891891891892, "grad_norm": 0.4633026123046875, "learning_rate": 4.7274912141505505e-06, "loss": 0.1609, "num_input_tokens_seen": 6021424, "step": 31590 }, { "epoch": 16.421517671517673, "grad_norm": 0.1909375637769699, "learning_rate": 4.7208577798342066e-06, "loss": 0.137, "num_input_tokens_seen": 6022352, "step": 31595 }, { "epoch": 16.424116424116423, "grad_norm": 0.6098224520683289, "learning_rate": 4.7142285174206684e-06, "loss": 0.078, "num_input_tokens_seen": 6023312, "step": 31600 }, { "epoch": 16.426715176715177, "grad_norm": 1.2164546251296997, "learning_rate": 4.707603428273735e-06, "loss": 0.1806, "num_input_tokens_seen": 6024240, "step": 31605 }, { "epoch": 16.429313929313928, "grad_norm": 0.8328974843025208, "learning_rate": 4.700982513756341e-06, "loss": 0.1189, "num_input_tokens_seen": 6025232, "step": 31610 }, { "epoch": 16.43191268191268, "grad_norm": 1.4818377494812012, "learning_rate": 4.694365775230566e-06, "loss": 0.0466, "num_input_tokens_seen": 6026192, "step": 31615 }, { "epoch": 16.434511434511435, "grad_norm": 0.15575195848941803, "learning_rate": 4.687753214057638e-06, "loss": 0.1556, "num_input_tokens_seen": 6027152, "step": 31620 }, { "epoch": 16.437110187110186, "grad_norm": 0.28805074095726013, "learning_rate": 4.681144831597914e-06, "loss": 0.0657, "num_input_tokens_seen": 6028080, "step": 31625 }, { "epoch": 16.43970893970894, "grad_norm": 1.453460931777954, "learning_rate": 4.674540629210888e-06, "loss": 0.1547, "num_input_tokens_seen": 6029040, "step": 31630 }, { "epoch": 16.442307692307693, "grad_norm": 1.3029178380966187, "learning_rate": 4.667940608255219e-06, "loss": 0.2409, "num_input_tokens_seen": 6030032, "step": 31635 }, { "epoch": 16.444906444906444, "grad_norm": 0.09307566285133362, "learning_rate": 4.661344770088677e-06, "loss": 0.1838, "num_input_tokens_seen": 6030896, "step": 31640 }, { "epoch": 16.447505197505198, "grad_norm": 0.4099239408969879, "learning_rate": 4.654753116068186e-06, "loss": 0.1299, "num_input_tokens_seen": 6031952, "step": 31645 }, { "epoch": 16.45010395010395, "grad_norm": 0.8791970610618591, "learning_rate": 4.648165647549802e-06, "loss": 0.1305, "num_input_tokens_seen": 6032944, "step": 31650 }, { "epoch": 16.4527027027027, "grad_norm": 0.2698696255683899, "learning_rate": 4.64158236588873e-06, "loss": 0.085, "num_input_tokens_seen": 6033904, "step": 31655 }, { "epoch": 16.455301455301456, "grad_norm": 0.8533824682235718, "learning_rate": 4.635003272439309e-06, "loss": 0.1249, "num_input_tokens_seen": 6034768, "step": 31660 }, { "epoch": 16.45790020790021, "grad_norm": 0.44485995173454285, "learning_rate": 4.62842836855501e-06, "loss": 0.1337, "num_input_tokens_seen": 6035696, "step": 31665 }, { "epoch": 16.46049896049896, "grad_norm": 0.6444351673126221, "learning_rate": 4.621857655588449e-06, "loss": 0.1169, "num_input_tokens_seen": 6036656, "step": 31670 }, { "epoch": 16.463097713097714, "grad_norm": 0.7343878746032715, "learning_rate": 4.615291134891381e-06, "loss": 0.062, "num_input_tokens_seen": 6037584, "step": 31675 }, { "epoch": 16.465696465696467, "grad_norm": 0.483378142118454, "learning_rate": 4.608728807814694e-06, "loss": 0.0782, "num_input_tokens_seen": 6038576, "step": 31680 }, { "epoch": 16.468295218295218, "grad_norm": 0.13222458958625793, "learning_rate": 4.602170675708406e-06, "loss": 0.0864, "num_input_tokens_seen": 6039504, "step": 31685 }, { "epoch": 16.47089397089397, "grad_norm": 0.2866605520248413, "learning_rate": 4.595616739921696e-06, "loss": 0.0935, "num_input_tokens_seen": 6040464, "step": 31690 }, { "epoch": 16.473492723492722, "grad_norm": 0.9026378393173218, "learning_rate": 4.58906700180286e-06, "loss": 0.1212, "num_input_tokens_seen": 6041360, "step": 31695 }, { "epoch": 16.476091476091476, "grad_norm": 1.4154479503631592, "learning_rate": 4.582521462699332e-06, "loss": 0.3476, "num_input_tokens_seen": 6042320, "step": 31700 }, { "epoch": 16.47869022869023, "grad_norm": 0.18107976019382477, "learning_rate": 4.5759801239576824e-06, "loss": 0.1064, "num_input_tokens_seen": 6043312, "step": 31705 }, { "epoch": 16.48128898128898, "grad_norm": 0.8847618103027344, "learning_rate": 4.569442986923631e-06, "loss": 0.1221, "num_input_tokens_seen": 6044272, "step": 31710 }, { "epoch": 16.483887733887734, "grad_norm": 0.9563705921173096, "learning_rate": 4.562910052942015e-06, "loss": 0.1467, "num_input_tokens_seen": 6045200, "step": 31715 }, { "epoch": 16.486486486486488, "grad_norm": 1.4843682050704956, "learning_rate": 4.5563813233568205e-06, "loss": 0.1059, "num_input_tokens_seen": 6046224, "step": 31720 }, { "epoch": 16.489085239085238, "grad_norm": 0.2477106750011444, "learning_rate": 4.549856799511149e-06, "loss": 0.0867, "num_input_tokens_seen": 6047184, "step": 31725 }, { "epoch": 16.491683991683992, "grad_norm": 0.18974569439888, "learning_rate": 4.543336482747274e-06, "loss": 0.0928, "num_input_tokens_seen": 6048112, "step": 31730 }, { "epoch": 16.494282744282746, "grad_norm": 0.09245600551366806, "learning_rate": 4.536820374406559e-06, "loss": 0.1217, "num_input_tokens_seen": 6048976, "step": 31735 }, { "epoch": 16.496881496881496, "grad_norm": 1.2067787647247314, "learning_rate": 4.530308475829523e-06, "loss": 0.198, "num_input_tokens_seen": 6049904, "step": 31740 }, { "epoch": 16.49948024948025, "grad_norm": 0.4952526390552521, "learning_rate": 4.523800788355834e-06, "loss": 0.0619, "num_input_tokens_seen": 6050800, "step": 31745 }, { "epoch": 16.502079002079, "grad_norm": 0.525036633014679, "learning_rate": 4.517297313324268e-06, "loss": 0.0875, "num_input_tokens_seen": 6051856, "step": 31750 }, { "epoch": 16.504677754677754, "grad_norm": 1.9541757106781006, "learning_rate": 4.51079805207274e-06, "loss": 0.1222, "num_input_tokens_seen": 6052752, "step": 31755 }, { "epoch": 16.507276507276508, "grad_norm": 0.5643918514251709, "learning_rate": 4.504303005938318e-06, "loss": 0.2562, "num_input_tokens_seen": 6053616, "step": 31760 }, { "epoch": 16.50987525987526, "grad_norm": 1.4300200939178467, "learning_rate": 4.497812176257179e-06, "loss": 0.2017, "num_input_tokens_seen": 6054576, "step": 31765 }, { "epoch": 16.512474012474012, "grad_norm": 0.5816093683242798, "learning_rate": 4.491325564364643e-06, "loss": 0.1834, "num_input_tokens_seen": 6055504, "step": 31770 }, { "epoch": 16.515072765072766, "grad_norm": 1.0035731792449951, "learning_rate": 4.4848431715951514e-06, "loss": 0.1314, "num_input_tokens_seen": 6056464, "step": 31775 }, { "epoch": 16.517671517671516, "grad_norm": 0.435835063457489, "learning_rate": 4.4783649992823e-06, "loss": 0.0774, "num_input_tokens_seen": 6057552, "step": 31780 }, { "epoch": 16.52027027027027, "grad_norm": 1.1438473463058472, "learning_rate": 4.471891048758803e-06, "loss": 0.0745, "num_input_tokens_seen": 6058448, "step": 31785 }, { "epoch": 16.522869022869024, "grad_norm": 0.24609775841236115, "learning_rate": 4.465421321356497e-06, "loss": 0.1743, "num_input_tokens_seen": 6059408, "step": 31790 }, { "epoch": 16.525467775467774, "grad_norm": 0.6898404359817505, "learning_rate": 4.458955818406365e-06, "loss": 0.0792, "num_input_tokens_seen": 6060336, "step": 31795 }, { "epoch": 16.528066528066528, "grad_norm": 0.14684629440307617, "learning_rate": 4.452494541238514e-06, "loss": 0.0368, "num_input_tokens_seen": 6061328, "step": 31800 }, { "epoch": 16.530665280665282, "grad_norm": 0.5021301507949829, "learning_rate": 4.4460374911821816e-06, "loss": 0.0573, "num_input_tokens_seen": 6062288, "step": 31805 }, { "epoch": 16.533264033264032, "grad_norm": 0.5740723609924316, "learning_rate": 4.4395846695657325e-06, "loss": 0.0611, "num_input_tokens_seen": 6063216, "step": 31810 }, { "epoch": 16.535862785862786, "grad_norm": 0.5899350047111511, "learning_rate": 4.4331360777166765e-06, "loss": 0.0837, "num_input_tokens_seen": 6064176, "step": 31815 }, { "epoch": 16.53846153846154, "grad_norm": 0.3557874858379364, "learning_rate": 4.426691716961637e-06, "loss": 0.0207, "num_input_tokens_seen": 6065104, "step": 31820 }, { "epoch": 16.54106029106029, "grad_norm": 0.8537569642066956, "learning_rate": 4.420251588626373e-06, "loss": 0.1614, "num_input_tokens_seen": 6066064, "step": 31825 }, { "epoch": 16.543659043659044, "grad_norm": 0.17367340624332428, "learning_rate": 4.413815694035766e-06, "loss": 0.184, "num_input_tokens_seen": 6066992, "step": 31830 }, { "epoch": 16.546257796257795, "grad_norm": 0.670467734336853, "learning_rate": 4.407384034513845e-06, "loss": 0.1876, "num_input_tokens_seen": 6067920, "step": 31835 }, { "epoch": 16.54885654885655, "grad_norm": 1.8879507780075073, "learning_rate": 4.400956611383747e-06, "loss": 0.1643, "num_input_tokens_seen": 6068880, "step": 31840 }, { "epoch": 16.551455301455302, "grad_norm": 1.0181779861450195, "learning_rate": 4.394533425967751e-06, "loss": 0.3654, "num_input_tokens_seen": 6069872, "step": 31845 }, { "epoch": 16.554054054054053, "grad_norm": 1.6686276197433472, "learning_rate": 4.388114479587252e-06, "loss": 0.2746, "num_input_tokens_seen": 6070832, "step": 31850 }, { "epoch": 16.556652806652806, "grad_norm": 0.7478066682815552, "learning_rate": 4.381699773562789e-06, "loss": 0.1382, "num_input_tokens_seen": 6071728, "step": 31855 }, { "epoch": 16.55925155925156, "grad_norm": 0.2993614971637726, "learning_rate": 4.375289309214023e-06, "loss": 0.1423, "num_input_tokens_seen": 6072688, "step": 31860 }, { "epoch": 16.56185031185031, "grad_norm": 0.1113174632191658, "learning_rate": 4.368883087859721e-06, "loss": 0.083, "num_input_tokens_seen": 6073616, "step": 31865 }, { "epoch": 16.564449064449065, "grad_norm": 1.8378875255584717, "learning_rate": 4.3624811108178125e-06, "loss": 0.2707, "num_input_tokens_seen": 6074512, "step": 31870 }, { "epoch": 16.56704781704782, "grad_norm": 0.9372697472572327, "learning_rate": 4.3560833794053295e-06, "loss": 0.2503, "num_input_tokens_seen": 6075440, "step": 31875 }, { "epoch": 16.56964656964657, "grad_norm": 0.9506142139434814, "learning_rate": 4.349689894938433e-06, "loss": 0.1268, "num_input_tokens_seen": 6076400, "step": 31880 }, { "epoch": 16.572245322245323, "grad_norm": 0.10935951769351959, "learning_rate": 4.34330065873243e-06, "loss": 0.1518, "num_input_tokens_seen": 6077360, "step": 31885 }, { "epoch": 16.574844074844076, "grad_norm": 0.08561049401760101, "learning_rate": 4.336915672101727e-06, "loss": 0.058, "num_input_tokens_seen": 6078224, "step": 31890 }, { "epoch": 16.577442827442827, "grad_norm": 0.11939907819032669, "learning_rate": 4.330534936359873e-06, "loss": 0.0988, "num_input_tokens_seen": 6079184, "step": 31895 }, { "epoch": 16.58004158004158, "grad_norm": 0.2651304304599762, "learning_rate": 4.3241584528195295e-06, "loss": 0.087, "num_input_tokens_seen": 6080144, "step": 31900 }, { "epoch": 16.58264033264033, "grad_norm": 0.3533170223236084, "learning_rate": 4.317786222792502e-06, "loss": 0.1108, "num_input_tokens_seen": 6081072, "step": 31905 }, { "epoch": 16.585239085239085, "grad_norm": 0.04873672500252724, "learning_rate": 4.311418247589705e-06, "loss": 0.0567, "num_input_tokens_seen": 6082000, "step": 31910 }, { "epoch": 16.58783783783784, "grad_norm": 1.2068967819213867, "learning_rate": 4.305054528521183e-06, "loss": 0.1743, "num_input_tokens_seen": 6082992, "step": 31915 }, { "epoch": 16.59043659043659, "grad_norm": 0.7453649044036865, "learning_rate": 4.2986950668960996e-06, "loss": 0.0979, "num_input_tokens_seen": 6083888, "step": 31920 }, { "epoch": 16.593035343035343, "grad_norm": 1.4147424697875977, "learning_rate": 4.2923398640227625e-06, "loss": 0.1146, "num_input_tokens_seen": 6084784, "step": 31925 }, { "epoch": 16.595634095634097, "grad_norm": 0.8023940324783325, "learning_rate": 4.285988921208573e-06, "loss": 0.1002, "num_input_tokens_seen": 6085776, "step": 31930 }, { "epoch": 16.598232848232847, "grad_norm": 0.2246508002281189, "learning_rate": 4.279642239760071e-06, "loss": 0.1523, "num_input_tokens_seen": 6086704, "step": 31935 }, { "epoch": 16.6008316008316, "grad_norm": 0.3022386133670807, "learning_rate": 4.273299820982932e-06, "loss": 0.1951, "num_input_tokens_seen": 6087632, "step": 31940 }, { "epoch": 16.603430353430355, "grad_norm": 0.8019508123397827, "learning_rate": 4.266961666181935e-06, "loss": 0.2305, "num_input_tokens_seen": 6088560, "step": 31945 }, { "epoch": 16.606029106029105, "grad_norm": 0.8527100086212158, "learning_rate": 4.260627776660992e-06, "loss": 0.0814, "num_input_tokens_seen": 6089520, "step": 31950 }, { "epoch": 16.60862785862786, "grad_norm": 0.6132376194000244, "learning_rate": 4.254298153723127e-06, "loss": 0.2083, "num_input_tokens_seen": 6090512, "step": 31955 }, { "epoch": 16.611226611226613, "grad_norm": 0.832948625087738, "learning_rate": 4.24797279867051e-06, "loss": 0.2255, "num_input_tokens_seen": 6091440, "step": 31960 }, { "epoch": 16.613825363825363, "grad_norm": 1.047777771949768, "learning_rate": 4.241651712804407e-06, "loss": 0.1493, "num_input_tokens_seen": 6092528, "step": 31965 }, { "epoch": 16.616424116424117, "grad_norm": 0.12294018268585205, "learning_rate": 4.235334897425216e-06, "loss": 0.0874, "num_input_tokens_seen": 6093488, "step": 31970 }, { "epoch": 16.61902286902287, "grad_norm": 0.24754777550697327, "learning_rate": 4.229022353832454e-06, "loss": 0.1387, "num_input_tokens_seen": 6094544, "step": 31975 }, { "epoch": 16.62162162162162, "grad_norm": 0.36539849638938904, "learning_rate": 4.222714083324769e-06, "loss": 0.1584, "num_input_tokens_seen": 6095440, "step": 31980 }, { "epoch": 16.624220374220375, "grad_norm": 0.2525358200073242, "learning_rate": 4.216410087199921e-06, "loss": 0.053, "num_input_tokens_seen": 6096400, "step": 31985 }, { "epoch": 16.626819126819125, "grad_norm": 1.7504040002822876, "learning_rate": 4.2101103667547905e-06, "loss": 0.1342, "num_input_tokens_seen": 6097392, "step": 31990 }, { "epoch": 16.62941787941788, "grad_norm": 0.8995101451873779, "learning_rate": 4.203814923285379e-06, "loss": 0.1785, "num_input_tokens_seen": 6098352, "step": 31995 }, { "epoch": 16.632016632016633, "grad_norm": 0.12122387439012527, "learning_rate": 4.197523758086813e-06, "loss": 0.236, "num_input_tokens_seen": 6099376, "step": 32000 }, { "epoch": 16.634615384615383, "grad_norm": 0.6748782396316528, "learning_rate": 4.1912368724533216e-06, "loss": 0.146, "num_input_tokens_seen": 6100304, "step": 32005 }, { "epoch": 16.637214137214137, "grad_norm": 1.6252480745315552, "learning_rate": 4.184954267678287e-06, "loss": 0.2173, "num_input_tokens_seen": 6101200, "step": 32010 }, { "epoch": 16.63981288981289, "grad_norm": 0.8934807777404785, "learning_rate": 4.178675945054181e-06, "loss": 0.1421, "num_input_tokens_seen": 6102064, "step": 32015 }, { "epoch": 16.64241164241164, "grad_norm": 0.9668917059898376, "learning_rate": 4.172401905872605e-06, "loss": 0.244, "num_input_tokens_seen": 6103056, "step": 32020 }, { "epoch": 16.645010395010395, "grad_norm": 0.1512719690799713, "learning_rate": 4.16613215142427e-06, "loss": 0.1499, "num_input_tokens_seen": 6104016, "step": 32025 }, { "epoch": 16.64760914760915, "grad_norm": 1.4452855587005615, "learning_rate": 4.15986668299903e-06, "loss": 0.1585, "num_input_tokens_seen": 6104976, "step": 32030 }, { "epoch": 16.6502079002079, "grad_norm": 0.7831330895423889, "learning_rate": 4.153605501885835e-06, "loss": 0.2672, "num_input_tokens_seen": 6105936, "step": 32035 }, { "epoch": 16.652806652806653, "grad_norm": 0.39757663011550903, "learning_rate": 4.1473486093727535e-06, "loss": 0.123, "num_input_tokens_seen": 6106896, "step": 32040 }, { "epoch": 16.655405405405407, "grad_norm": 1.379766821861267, "learning_rate": 4.141096006746975e-06, "loss": 0.1792, "num_input_tokens_seen": 6107856, "step": 32045 }, { "epoch": 16.658004158004157, "grad_norm": 0.9864805340766907, "learning_rate": 4.134847695294825e-06, "loss": 0.1873, "num_input_tokens_seen": 6108784, "step": 32050 }, { "epoch": 16.66060291060291, "grad_norm": 0.8454635143280029, "learning_rate": 4.128603676301723e-06, "loss": 0.2046, "num_input_tokens_seen": 6109776, "step": 32055 }, { "epoch": 16.66320166320166, "grad_norm": 1.3783996105194092, "learning_rate": 4.122363951052197e-06, "loss": 0.1829, "num_input_tokens_seen": 6110704, "step": 32060 }, { "epoch": 16.665800415800415, "grad_norm": 0.5902752876281738, "learning_rate": 4.116128520829926e-06, "loss": 0.1382, "num_input_tokens_seen": 6111664, "step": 32065 }, { "epoch": 16.66839916839917, "grad_norm": 1.2049301862716675, "learning_rate": 4.10989738691768e-06, "loss": 0.1649, "num_input_tokens_seen": 6112592, "step": 32070 }, { "epoch": 16.67099792099792, "grad_norm": 0.8622502088546753, "learning_rate": 4.103670550597355e-06, "loss": 0.0982, "num_input_tokens_seen": 6113584, "step": 32075 }, { "epoch": 16.673596673596673, "grad_norm": 1.5400137901306152, "learning_rate": 4.097448013149949e-06, "loss": 0.2221, "num_input_tokens_seen": 6114576, "step": 32080 }, { "epoch": 16.676195426195427, "grad_norm": 0.7239350080490112, "learning_rate": 4.091229775855598e-06, "loss": 0.1037, "num_input_tokens_seen": 6115504, "step": 32085 }, { "epoch": 16.678794178794178, "grad_norm": 0.3837830722332001, "learning_rate": 4.085015839993539e-06, "loss": 0.2041, "num_input_tokens_seen": 6116560, "step": 32090 }, { "epoch": 16.68139293139293, "grad_norm": 0.256271630525589, "learning_rate": 4.078806206842126e-06, "loss": 0.0353, "num_input_tokens_seen": 6117424, "step": 32095 }, { "epoch": 16.683991683991685, "grad_norm": 0.49288278818130493, "learning_rate": 4.07260087767882e-06, "loss": 0.1153, "num_input_tokens_seen": 6118352, "step": 32100 }, { "epoch": 16.686590436590436, "grad_norm": 1.8476849794387817, "learning_rate": 4.06639985378022e-06, "loss": 0.2455, "num_input_tokens_seen": 6119376, "step": 32105 }, { "epoch": 16.68918918918919, "grad_norm": 0.1889871209859848, "learning_rate": 4.0602031364220164e-06, "loss": 0.1138, "num_input_tokens_seen": 6120336, "step": 32110 }, { "epoch": 16.691787941787943, "grad_norm": 1.5655927658081055, "learning_rate": 4.054010726879015e-06, "loss": 0.1482, "num_input_tokens_seen": 6121296, "step": 32115 }, { "epoch": 16.694386694386694, "grad_norm": 1.5116461515426636, "learning_rate": 4.047822626425163e-06, "loss": 0.1401, "num_input_tokens_seen": 6122224, "step": 32120 }, { "epoch": 16.696985446985448, "grad_norm": 0.20782507956027985, "learning_rate": 4.0416388363334805e-06, "loss": 0.0716, "num_input_tokens_seen": 6123120, "step": 32125 }, { "epoch": 16.6995841995842, "grad_norm": 1.5920544862747192, "learning_rate": 4.0354593578761205e-06, "loss": 0.1979, "num_input_tokens_seen": 6124112, "step": 32130 }, { "epoch": 16.70218295218295, "grad_norm": 0.4230036437511444, "learning_rate": 4.029284192324362e-06, "loss": 0.144, "num_input_tokens_seen": 6125040, "step": 32135 }, { "epoch": 16.704781704781706, "grad_norm": 1.1714003086090088, "learning_rate": 4.0231133409485754e-06, "loss": 0.1684, "num_input_tokens_seen": 6126032, "step": 32140 }, { "epoch": 16.707380457380456, "grad_norm": 0.7730517387390137, "learning_rate": 4.016946805018254e-06, "loss": 0.1673, "num_input_tokens_seen": 6126960, "step": 32145 }, { "epoch": 16.70997920997921, "grad_norm": 0.5074828863143921, "learning_rate": 4.010784585801994e-06, "loss": 0.1056, "num_input_tokens_seen": 6127952, "step": 32150 }, { "epoch": 16.712577962577964, "grad_norm": 0.11872649937868118, "learning_rate": 4.004626684567523e-06, "loss": 0.1211, "num_input_tokens_seen": 6128880, "step": 32155 }, { "epoch": 16.715176715176714, "grad_norm": 0.9772204756736755, "learning_rate": 3.99847310258166e-06, "loss": 0.0903, "num_input_tokens_seen": 6129872, "step": 32160 }, { "epoch": 16.717775467775468, "grad_norm": 1.0776242017745972, "learning_rate": 3.992323841110348e-06, "loss": 0.1283, "num_input_tokens_seen": 6130864, "step": 32165 }, { "epoch": 16.72037422037422, "grad_norm": 0.22460350394248962, "learning_rate": 3.986178901418627e-06, "loss": 0.1638, "num_input_tokens_seen": 6131824, "step": 32170 }, { "epoch": 16.722972972972972, "grad_norm": 0.871944010257721, "learning_rate": 3.980038284770671e-06, "loss": 0.1285, "num_input_tokens_seen": 6132752, "step": 32175 }, { "epoch": 16.725571725571726, "grad_norm": 0.9760953187942505, "learning_rate": 3.973901992429746e-06, "loss": 0.2188, "num_input_tokens_seen": 6133680, "step": 32180 }, { "epoch": 16.72817047817048, "grad_norm": 1.3387832641601562, "learning_rate": 3.967770025658232e-06, "loss": 0.1413, "num_input_tokens_seen": 6134736, "step": 32185 }, { "epoch": 16.73076923076923, "grad_norm": 0.2597719430923462, "learning_rate": 3.96164238571762e-06, "loss": 0.0989, "num_input_tokens_seen": 6135632, "step": 32190 }, { "epoch": 16.733367983367984, "grad_norm": 1.0435378551483154, "learning_rate": 3.955519073868513e-06, "loss": 0.2616, "num_input_tokens_seen": 6136592, "step": 32195 }, { "epoch": 16.735966735966738, "grad_norm": 0.25458160042762756, "learning_rate": 3.949400091370623e-06, "loss": 0.1448, "num_input_tokens_seen": 6137552, "step": 32200 }, { "epoch": 16.738565488565488, "grad_norm": 0.4842706322669983, "learning_rate": 3.943285439482763e-06, "loss": 0.0561, "num_input_tokens_seen": 6138448, "step": 32205 }, { "epoch": 16.741164241164242, "grad_norm": 0.8241041302680969, "learning_rate": 3.937175119462874e-06, "loss": 0.1871, "num_input_tokens_seen": 6139472, "step": 32210 }, { "epoch": 16.743762993762992, "grad_norm": 0.762697160243988, "learning_rate": 3.931069132567991e-06, "loss": 0.1577, "num_input_tokens_seen": 6140368, "step": 32215 }, { "epoch": 16.746361746361746, "grad_norm": 0.8984560966491699, "learning_rate": 3.924967480054256e-06, "loss": 0.1389, "num_input_tokens_seen": 6141296, "step": 32220 }, { "epoch": 16.7489604989605, "grad_norm": 0.7568299770355225, "learning_rate": 3.918870163176924e-06, "loss": 0.2107, "num_input_tokens_seen": 6142256, "step": 32225 }, { "epoch": 16.75155925155925, "grad_norm": 0.46646326780319214, "learning_rate": 3.912777183190369e-06, "loss": 0.1387, "num_input_tokens_seen": 6143312, "step": 32230 }, { "epoch": 16.754158004158004, "grad_norm": 1.3247989416122437, "learning_rate": 3.906688541348055e-06, "loss": 0.1974, "num_input_tokens_seen": 6144272, "step": 32235 }, { "epoch": 16.756756756756758, "grad_norm": 0.5103265047073364, "learning_rate": 3.900604238902556e-06, "loss": 0.1287, "num_input_tokens_seen": 6145232, "step": 32240 }, { "epoch": 16.759355509355508, "grad_norm": 1.637128472328186, "learning_rate": 3.894524277105565e-06, "loss": 0.0648, "num_input_tokens_seen": 6146192, "step": 32245 }, { "epoch": 16.761954261954262, "grad_norm": 1.704132080078125, "learning_rate": 3.888448657207883e-06, "loss": 0.1575, "num_input_tokens_seen": 6147120, "step": 32250 }, { "epoch": 16.764553014553016, "grad_norm": 1.3691757917404175, "learning_rate": 3.8823773804593875e-06, "loss": 0.2861, "num_input_tokens_seen": 6148048, "step": 32255 }, { "epoch": 16.767151767151766, "grad_norm": 0.3538389801979065, "learning_rate": 3.876310448109102e-06, "loss": 0.1042, "num_input_tokens_seen": 6149104, "step": 32260 }, { "epoch": 16.76975051975052, "grad_norm": 0.41298308968544006, "learning_rate": 3.8702478614051355e-06, "loss": 0.0464, "num_input_tokens_seen": 6150032, "step": 32265 }, { "epoch": 16.772349272349274, "grad_norm": 1.3212231397628784, "learning_rate": 3.864189621594702e-06, "loss": 0.1136, "num_input_tokens_seen": 6150992, "step": 32270 }, { "epoch": 16.774948024948024, "grad_norm": 0.145234614610672, "learning_rate": 3.8581357299241255e-06, "loss": 0.0616, "num_input_tokens_seen": 6151984, "step": 32275 }, { "epoch": 16.777546777546778, "grad_norm": 0.15578855574131012, "learning_rate": 3.852086187638846e-06, "loss": 0.1409, "num_input_tokens_seen": 6152944, "step": 32280 }, { "epoch": 16.78014553014553, "grad_norm": 0.6504104733467102, "learning_rate": 3.846040995983391e-06, "loss": 0.2007, "num_input_tokens_seen": 6153904, "step": 32285 }, { "epoch": 16.782744282744282, "grad_norm": 0.06617807596921921, "learning_rate": 3.840000156201401e-06, "loss": 0.1061, "num_input_tokens_seen": 6154864, "step": 32290 }, { "epoch": 16.785343035343036, "grad_norm": 0.9233967065811157, "learning_rate": 3.833963669535615e-06, "loss": 0.1093, "num_input_tokens_seen": 6155728, "step": 32295 }, { "epoch": 16.787941787941786, "grad_norm": 0.7937437295913696, "learning_rate": 3.827931537227894e-06, "loss": 0.1167, "num_input_tokens_seen": 6156656, "step": 32300 }, { "epoch": 16.79054054054054, "grad_norm": 0.6827577948570251, "learning_rate": 3.821903760519188e-06, "loss": 0.1637, "num_input_tokens_seen": 6157616, "step": 32305 }, { "epoch": 16.793139293139294, "grad_norm": 0.625095546245575, "learning_rate": 3.815880340649549e-06, "loss": 0.0548, "num_input_tokens_seen": 6158480, "step": 32310 }, { "epoch": 16.795738045738045, "grad_norm": 0.44552284479141235, "learning_rate": 3.8098612788581427e-06, "loss": 0.167, "num_input_tokens_seen": 6159408, "step": 32315 }, { "epoch": 16.7983367983368, "grad_norm": 1.4267833232879639, "learning_rate": 3.8038465763832298e-06, "loss": 0.1973, "num_input_tokens_seen": 6160464, "step": 32320 }, { "epoch": 16.800935550935552, "grad_norm": 0.212645024061203, "learning_rate": 3.797836234462182e-06, "loss": 0.0351, "num_input_tokens_seen": 6161360, "step": 32325 }, { "epoch": 16.803534303534303, "grad_norm": 0.33237552642822266, "learning_rate": 3.7918302543314606e-06, "loss": 0.0806, "num_input_tokens_seen": 6162288, "step": 32330 }, { "epoch": 16.806133056133056, "grad_norm": 1.0820971727371216, "learning_rate": 3.7858286372266515e-06, "loss": 0.1825, "num_input_tokens_seen": 6163376, "step": 32335 }, { "epoch": 16.80873180873181, "grad_norm": 0.48406335711479187, "learning_rate": 3.7798313843824237e-06, "loss": 0.1167, "num_input_tokens_seen": 6164336, "step": 32340 }, { "epoch": 16.81133056133056, "grad_norm": 0.16942380368709564, "learning_rate": 3.7738384970325586e-06, "loss": 0.0291, "num_input_tokens_seen": 6165232, "step": 32345 }, { "epoch": 16.813929313929314, "grad_norm": 0.9063984155654907, "learning_rate": 3.767849976409926e-06, "loss": 0.1228, "num_input_tokens_seen": 6166192, "step": 32350 }, { "epoch": 16.816528066528065, "grad_norm": 0.530369758605957, "learning_rate": 3.7618658237465216e-06, "loss": 0.1026, "num_input_tokens_seen": 6167120, "step": 32355 }, { "epoch": 16.81912681912682, "grad_norm": 0.6001789569854736, "learning_rate": 3.7558860402734206e-06, "loss": 0.1047, "num_input_tokens_seen": 6168080, "step": 32360 }, { "epoch": 16.821725571725572, "grad_norm": 0.38160303235054016, "learning_rate": 3.749910627220801e-06, "loss": 0.1601, "num_input_tokens_seen": 6169008, "step": 32365 }, { "epoch": 16.824324324324323, "grad_norm": 0.391645610332489, "learning_rate": 3.7439395858179625e-06, "loss": 0.127, "num_input_tokens_seen": 6170032, "step": 32370 }, { "epoch": 16.826923076923077, "grad_norm": 0.8681654930114746, "learning_rate": 3.737972917293281e-06, "loss": 0.0919, "num_input_tokens_seen": 6170992, "step": 32375 }, { "epoch": 16.82952182952183, "grad_norm": 0.40074416995048523, "learning_rate": 3.7320106228742462e-06, "loss": 0.128, "num_input_tokens_seen": 6171888, "step": 32380 }, { "epoch": 16.83212058212058, "grad_norm": 0.8897218704223633, "learning_rate": 3.726052703787439e-06, "loss": 0.1545, "num_input_tokens_seen": 6172848, "step": 32385 }, { "epoch": 16.834719334719335, "grad_norm": 0.7205590009689331, "learning_rate": 3.72009916125855e-06, "loss": 0.1242, "num_input_tokens_seen": 6173776, "step": 32390 }, { "epoch": 16.83731808731809, "grad_norm": 0.4417761564254761, "learning_rate": 3.7141499965123616e-06, "loss": 0.2496, "num_input_tokens_seen": 6174768, "step": 32395 }, { "epoch": 16.83991683991684, "grad_norm": 1.344186782836914, "learning_rate": 3.708205210772753e-06, "loss": 0.1342, "num_input_tokens_seen": 6175728, "step": 32400 }, { "epoch": 16.842515592515593, "grad_norm": 0.7516773343086243, "learning_rate": 3.7022648052627228e-06, "loss": 0.1701, "num_input_tokens_seen": 6176624, "step": 32405 }, { "epoch": 16.845114345114347, "grad_norm": 0.11247220635414124, "learning_rate": 3.6963287812043473e-06, "loss": 0.0665, "num_input_tokens_seen": 6177584, "step": 32410 }, { "epoch": 16.847713097713097, "grad_norm": 0.4752449095249176, "learning_rate": 3.6903971398188074e-06, "loss": 0.1164, "num_input_tokens_seen": 6178512, "step": 32415 }, { "epoch": 16.85031185031185, "grad_norm": 0.5880224704742432, "learning_rate": 3.6844698823263748e-06, "loss": 0.1096, "num_input_tokens_seen": 6179440, "step": 32420 }, { "epoch": 16.852910602910605, "grad_norm": 0.20548968017101288, "learning_rate": 3.678547009946445e-06, "loss": 0.0788, "num_input_tokens_seen": 6180336, "step": 32425 }, { "epoch": 16.855509355509355, "grad_norm": 0.10166864097118378, "learning_rate": 3.672628523897481e-06, "loss": 0.103, "num_input_tokens_seen": 6181264, "step": 32430 }, { "epoch": 16.85810810810811, "grad_norm": 1.3012464046478271, "learning_rate": 3.6667144253970637e-06, "loss": 0.0706, "num_input_tokens_seen": 6182192, "step": 32435 }, { "epoch": 16.86070686070686, "grad_norm": 0.8108916282653809, "learning_rate": 3.660804715661853e-06, "loss": 0.1193, "num_input_tokens_seen": 6183152, "step": 32440 }, { "epoch": 16.863305613305613, "grad_norm": 0.24071452021598816, "learning_rate": 3.654899395907632e-06, "loss": 0.1728, "num_input_tokens_seen": 6184144, "step": 32445 }, { "epoch": 16.865904365904367, "grad_norm": 0.7918195724487305, "learning_rate": 3.6489984673492628e-06, "loss": 0.1906, "num_input_tokens_seen": 6185104, "step": 32450 }, { "epoch": 16.868503118503117, "grad_norm": 1.2328349351882935, "learning_rate": 3.6431019312006903e-06, "loss": 0.1998, "num_input_tokens_seen": 6186032, "step": 32455 }, { "epoch": 16.87110187110187, "grad_norm": 0.31813931465148926, "learning_rate": 3.6372097886749917e-06, "loss": 0.1578, "num_input_tokens_seen": 6187056, "step": 32460 }, { "epoch": 16.873700623700625, "grad_norm": 0.10983458161354065, "learning_rate": 3.631322040984317e-06, "loss": 0.0454, "num_input_tokens_seen": 6188048, "step": 32465 }, { "epoch": 16.876299376299375, "grad_norm": 1.0768029689788818, "learning_rate": 3.6254386893399077e-06, "loss": 0.1423, "num_input_tokens_seen": 6189008, "step": 32470 }, { "epoch": 16.87889812889813, "grad_norm": 0.490596741437912, "learning_rate": 3.619559734952113e-06, "loss": 0.1369, "num_input_tokens_seen": 6189968, "step": 32475 }, { "epoch": 16.881496881496883, "grad_norm": 0.019368743523955345, "learning_rate": 3.613685179030382e-06, "loss": 0.0861, "num_input_tokens_seen": 6190864, "step": 32480 }, { "epoch": 16.884095634095633, "grad_norm": 0.18221844732761383, "learning_rate": 3.607815022783245e-06, "loss": 0.1452, "num_input_tokens_seen": 6191920, "step": 32485 }, { "epoch": 16.886694386694387, "grad_norm": 0.4861295223236084, "learning_rate": 3.6019492674183253e-06, "loss": 0.2113, "num_input_tokens_seen": 6192848, "step": 32490 }, { "epoch": 16.88929313929314, "grad_norm": 0.7124122977256775, "learning_rate": 3.5960879141423627e-06, "loss": 0.0505, "num_input_tokens_seen": 6193840, "step": 32495 }, { "epoch": 16.89189189189189, "grad_norm": 0.09624738246202469, "learning_rate": 3.5902309641611705e-06, "loss": 0.0207, "num_input_tokens_seen": 6194768, "step": 32500 }, { "epoch": 16.894490644490645, "grad_norm": 0.8413965702056885, "learning_rate": 3.5843784186796654e-06, "loss": 0.1159, "num_input_tokens_seen": 6195728, "step": 32505 }, { "epoch": 16.897089397089395, "grad_norm": 0.10954112559556961, "learning_rate": 3.5785302789018454e-06, "loss": 0.1147, "num_input_tokens_seen": 6196656, "step": 32510 }, { "epoch": 16.89968814968815, "grad_norm": 1.0443977117538452, "learning_rate": 3.572686546030832e-06, "loss": 0.1095, "num_input_tokens_seen": 6197584, "step": 32515 }, { "epoch": 16.902286902286903, "grad_norm": 2.152625799179077, "learning_rate": 3.5668472212688016e-06, "loss": 0.1317, "num_input_tokens_seen": 6198544, "step": 32520 }, { "epoch": 16.904885654885653, "grad_norm": 0.7164870500564575, "learning_rate": 3.5610123058170437e-06, "loss": 0.1305, "num_input_tokens_seen": 6199536, "step": 32525 }, { "epoch": 16.907484407484407, "grad_norm": 1.178783893585205, "learning_rate": 3.5551818008759506e-06, "loss": 0.1927, "num_input_tokens_seen": 6200528, "step": 32530 }, { "epoch": 16.91008316008316, "grad_norm": 0.30084794759750366, "learning_rate": 3.5493557076449912e-06, "loss": 0.1262, "num_input_tokens_seen": 6201552, "step": 32535 }, { "epoch": 16.91268191268191, "grad_norm": 0.9376965761184692, "learning_rate": 3.5435340273227313e-06, "loss": 0.1613, "num_input_tokens_seen": 6202480, "step": 32540 }, { "epoch": 16.915280665280665, "grad_norm": 1.0925172567367554, "learning_rate": 3.537716761106821e-06, "loss": 0.2923, "num_input_tokens_seen": 6203504, "step": 32545 }, { "epoch": 16.91787941787942, "grad_norm": 0.8577103018760681, "learning_rate": 3.531903910194026e-06, "loss": 0.1845, "num_input_tokens_seen": 6204464, "step": 32550 }, { "epoch": 16.92047817047817, "grad_norm": 0.8848347067832947, "learning_rate": 3.5260954757801807e-06, "loss": 0.1912, "num_input_tokens_seen": 6205488, "step": 32555 }, { "epoch": 16.923076923076923, "grad_norm": 0.7030035853385925, "learning_rate": 3.520291459060218e-06, "loss": 0.1414, "num_input_tokens_seen": 6206512, "step": 32560 }, { "epoch": 16.925675675675677, "grad_norm": 2.1838624477386475, "learning_rate": 3.5144918612281585e-06, "loss": 0.1104, "num_input_tokens_seen": 6207472, "step": 32565 }, { "epoch": 16.928274428274428, "grad_norm": 1.036232829093933, "learning_rate": 3.508696683477128e-06, "loss": 0.1088, "num_input_tokens_seen": 6208432, "step": 32570 }, { "epoch": 16.93087318087318, "grad_norm": 1.7626045942306519, "learning_rate": 3.5029059269993253e-06, "loss": 0.1685, "num_input_tokens_seen": 6209392, "step": 32575 }, { "epoch": 16.933471933471935, "grad_norm": 0.07487466186285019, "learning_rate": 3.497119592986051e-06, "loss": 0.1877, "num_input_tokens_seen": 6210320, "step": 32580 }, { "epoch": 16.936070686070686, "grad_norm": 0.36145463585853577, "learning_rate": 3.491337682627685e-06, "loss": 0.0329, "num_input_tokens_seen": 6211280, "step": 32585 }, { "epoch": 16.93866943866944, "grad_norm": 0.17745095491409302, "learning_rate": 3.485560197113713e-06, "loss": 0.09, "num_input_tokens_seen": 6212240, "step": 32590 }, { "epoch": 16.94126819126819, "grad_norm": 0.32549792528152466, "learning_rate": 3.4797871376326925e-06, "loss": 0.1311, "num_input_tokens_seen": 6213200, "step": 32595 }, { "epoch": 16.943866943866944, "grad_norm": 0.24371899664402008, "learning_rate": 3.4740185053722814e-06, "loss": 0.1317, "num_input_tokens_seen": 6214064, "step": 32600 }, { "epoch": 16.946465696465697, "grad_norm": 0.7271033525466919, "learning_rate": 3.4682543015192333e-06, "loss": 0.1235, "num_input_tokens_seen": 6215056, "step": 32605 }, { "epoch": 16.949064449064448, "grad_norm": 0.24482136964797974, "learning_rate": 3.4624945272593747e-06, "loss": 0.1719, "num_input_tokens_seen": 6216048, "step": 32610 }, { "epoch": 16.9516632016632, "grad_norm": 0.116875559091568, "learning_rate": 3.4567391837776243e-06, "loss": 0.1252, "num_input_tokens_seen": 6217136, "step": 32615 }, { "epoch": 16.954261954261955, "grad_norm": 0.23213844001293182, "learning_rate": 3.4509882722580044e-06, "loss": 0.1547, "num_input_tokens_seen": 6218064, "step": 32620 }, { "epoch": 16.956860706860706, "grad_norm": 0.19849145412445068, "learning_rate": 3.4452417938836107e-06, "loss": 0.1108, "num_input_tokens_seen": 6218960, "step": 32625 }, { "epoch": 16.95945945945946, "grad_norm": 0.7776549458503723, "learning_rate": 3.4394997498366277e-06, "loss": 0.0554, "num_input_tokens_seen": 6219984, "step": 32630 }, { "epoch": 16.962058212058214, "grad_norm": 0.9562863707542419, "learning_rate": 3.4337621412983274e-06, "loss": 0.1209, "num_input_tokens_seen": 6220944, "step": 32635 }, { "epoch": 16.964656964656964, "grad_norm": 0.22779928147792816, "learning_rate": 3.428028969449082e-06, "loss": 0.1821, "num_input_tokens_seen": 6221936, "step": 32640 }, { "epoch": 16.967255717255718, "grad_norm": 0.47250717878341675, "learning_rate": 3.422300235468345e-06, "loss": 0.0836, "num_input_tokens_seen": 6222928, "step": 32645 }, { "epoch": 16.96985446985447, "grad_norm": 0.2756491005420685, "learning_rate": 3.4165759405346303e-06, "loss": 0.0751, "num_input_tokens_seen": 6223920, "step": 32650 }, { "epoch": 16.972453222453222, "grad_norm": 0.7403966784477234, "learning_rate": 3.4108560858255866e-06, "loss": 0.1177, "num_input_tokens_seen": 6224880, "step": 32655 }, { "epoch": 16.975051975051976, "grad_norm": 0.13836775720119476, "learning_rate": 3.405140672517912e-06, "loss": 0.098, "num_input_tokens_seen": 6225840, "step": 32660 }, { "epoch": 16.977650727650726, "grad_norm": 1.4505277872085571, "learning_rate": 3.399429701787407e-06, "loss": 0.1018, "num_input_tokens_seen": 6226800, "step": 32665 }, { "epoch": 16.98024948024948, "grad_norm": 0.24058754742145538, "learning_rate": 3.3937231748089493e-06, "loss": 0.1344, "num_input_tokens_seen": 6227792, "step": 32670 }, { "epoch": 16.982848232848234, "grad_norm": 0.2845885753631592, "learning_rate": 3.388021092756516e-06, "loss": 0.1756, "num_input_tokens_seen": 6228784, "step": 32675 }, { "epoch": 16.985446985446984, "grad_norm": 0.48349258303642273, "learning_rate": 3.3823234568031547e-06, "loss": 0.1885, "num_input_tokens_seen": 6229648, "step": 32680 }, { "epoch": 16.988045738045738, "grad_norm": 0.7345679998397827, "learning_rate": 3.376630268121009e-06, "loss": 0.2153, "num_input_tokens_seen": 6230576, "step": 32685 }, { "epoch": 16.990644490644492, "grad_norm": 1.674645185470581, "learning_rate": 3.370941527881297e-06, "loss": 0.1301, "num_input_tokens_seen": 6231472, "step": 32690 }, { "epoch": 16.993243243243242, "grad_norm": 0.5152134895324707, "learning_rate": 3.365257237254335e-06, "loss": 0.1214, "num_input_tokens_seen": 6232432, "step": 32695 }, { "epoch": 16.995841995841996, "grad_norm": 0.33407530188560486, "learning_rate": 3.3595773974095163e-06, "loss": 0.0362, "num_input_tokens_seen": 6233392, "step": 32700 }, { "epoch": 16.99844074844075, "grad_norm": 1.1630423069000244, "learning_rate": 3.353902009515317e-06, "loss": 0.1196, "num_input_tokens_seen": 6234384, "step": 32705 }, { "epoch": 17.0, "eval_loss": 0.14667558670043945, "eval_runtime": 8.0239, "eval_samples_per_second": 106.682, "eval_steps_per_second": 26.67, "num_input_tokens_seen": 6234920, "step": 32708 }, { "epoch": 17.0010395010395, "grad_norm": 0.18960575759410858, "learning_rate": 3.3482310747393e-06, "loss": 0.1168, "num_input_tokens_seen": 6235336, "step": 32710 }, { "epoch": 17.003638253638254, "grad_norm": 0.9292476773262024, "learning_rate": 3.3425645942481126e-06, "loss": 0.1974, "num_input_tokens_seen": 6236264, "step": 32715 }, { "epoch": 17.006237006237008, "grad_norm": 0.35475605726242065, "learning_rate": 3.336902569207484e-06, "loss": 0.0464, "num_input_tokens_seen": 6237224, "step": 32720 }, { "epoch": 17.008835758835758, "grad_norm": 1.113619089126587, "learning_rate": 3.331245000782221e-06, "loss": 0.1657, "num_input_tokens_seen": 6238184, "step": 32725 }, { "epoch": 17.011434511434512, "grad_norm": 1.3315508365631104, "learning_rate": 3.3255918901362342e-06, "loss": 0.1454, "num_input_tokens_seen": 6239144, "step": 32730 }, { "epoch": 17.014033264033262, "grad_norm": 0.6997853517532349, "learning_rate": 3.3199432384324964e-06, "loss": 0.1595, "num_input_tokens_seen": 6240104, "step": 32735 }, { "epoch": 17.016632016632016, "grad_norm": 0.10470709949731827, "learning_rate": 3.3142990468330636e-06, "loss": 0.1991, "num_input_tokens_seen": 6241000, "step": 32740 }, { "epoch": 17.01923076923077, "grad_norm": 0.20916186273097992, "learning_rate": 3.308659316499094e-06, "loss": 0.0993, "num_input_tokens_seen": 6241928, "step": 32745 }, { "epoch": 17.02182952182952, "grad_norm": 0.16326512396335602, "learning_rate": 3.3030240485908025e-06, "loss": 0.1135, "num_input_tokens_seen": 6242952, "step": 32750 }, { "epoch": 17.024428274428274, "grad_norm": 0.38604071736335754, "learning_rate": 3.297393244267505e-06, "loss": 0.1664, "num_input_tokens_seen": 6243912, "step": 32755 }, { "epoch": 17.027027027027028, "grad_norm": 0.14733678102493286, "learning_rate": 3.2917669046875847e-06, "loss": 0.1327, "num_input_tokens_seen": 6244872, "step": 32760 }, { "epoch": 17.02962577962578, "grad_norm": 0.19675032794475555, "learning_rate": 3.286145031008522e-06, "loss": 0.1171, "num_input_tokens_seen": 6245800, "step": 32765 }, { "epoch": 17.032224532224532, "grad_norm": 1.1403319835662842, "learning_rate": 3.28052762438687e-06, "loss": 0.1435, "num_input_tokens_seen": 6246824, "step": 32770 }, { "epoch": 17.034823284823286, "grad_norm": 0.7885795831680298, "learning_rate": 3.2749146859782587e-06, "loss": 0.0954, "num_input_tokens_seen": 6247816, "step": 32775 }, { "epoch": 17.037422037422036, "grad_norm": 0.6504799127578735, "learning_rate": 3.269306216937404e-06, "loss": 0.1392, "num_input_tokens_seen": 6248776, "step": 32780 }, { "epoch": 17.04002079002079, "grad_norm": 0.8966971039772034, "learning_rate": 3.2637022184181022e-06, "loss": 0.1601, "num_input_tokens_seen": 6249704, "step": 32785 }, { "epoch": 17.042619542619544, "grad_norm": 0.13274751603603363, "learning_rate": 3.258102691573231e-06, "loss": 0.1791, "num_input_tokens_seen": 6250696, "step": 32790 }, { "epoch": 17.045218295218294, "grad_norm": 0.9721534252166748, "learning_rate": 3.2525076375547387e-06, "loss": 0.231, "num_input_tokens_seen": 6251624, "step": 32795 }, { "epoch": 17.04781704781705, "grad_norm": 0.12701556086540222, "learning_rate": 3.246917057513671e-06, "loss": 0.0842, "num_input_tokens_seen": 6252584, "step": 32800 }, { "epoch": 17.050415800415802, "grad_norm": 0.37590906023979187, "learning_rate": 3.2413309526001413e-06, "loss": 0.0403, "num_input_tokens_seen": 6253512, "step": 32805 }, { "epoch": 17.053014553014552, "grad_norm": 0.688671886920929, "learning_rate": 3.235749323963344e-06, "loss": 0.1917, "num_input_tokens_seen": 6254504, "step": 32810 }, { "epoch": 17.055613305613306, "grad_norm": 0.259906142950058, "learning_rate": 3.2301721727515442e-06, "loss": 0.0736, "num_input_tokens_seen": 6255432, "step": 32815 }, { "epoch": 17.058212058212057, "grad_norm": 0.6031084656715393, "learning_rate": 3.2245995001121106e-06, "loss": 0.0658, "num_input_tokens_seen": 6256328, "step": 32820 }, { "epoch": 17.06081081081081, "grad_norm": 0.37863340973854065, "learning_rate": 3.219031307191467e-06, "loss": 0.1309, "num_input_tokens_seen": 6257288, "step": 32825 }, { "epoch": 17.063409563409564, "grad_norm": 0.5205627083778381, "learning_rate": 3.213467595135122e-06, "loss": 0.0707, "num_input_tokens_seen": 6258248, "step": 32830 }, { "epoch": 17.066008316008315, "grad_norm": 0.10259588807821274, "learning_rate": 3.207908365087661e-06, "loss": 0.0815, "num_input_tokens_seen": 6259272, "step": 32835 }, { "epoch": 17.06860706860707, "grad_norm": 0.11808896064758301, "learning_rate": 3.2023536181927632e-06, "loss": 0.0663, "num_input_tokens_seen": 6260168, "step": 32840 }, { "epoch": 17.071205821205822, "grad_norm": 0.49344900250434875, "learning_rate": 3.1968033555931586e-06, "loss": 0.4234, "num_input_tokens_seen": 6261160, "step": 32845 }, { "epoch": 17.073804573804573, "grad_norm": 0.5200952887535095, "learning_rate": 3.1912575784306704e-06, "loss": 0.0953, "num_input_tokens_seen": 6262056, "step": 32850 }, { "epoch": 17.076403326403327, "grad_norm": 1.0796337127685547, "learning_rate": 3.185716287846202e-06, "loss": 0.1524, "num_input_tokens_seen": 6263048, "step": 32855 }, { "epoch": 17.07900207900208, "grad_norm": 0.5910221338272095, "learning_rate": 3.1801794849797285e-06, "loss": 0.0407, "num_input_tokens_seen": 6263944, "step": 32860 }, { "epoch": 17.08160083160083, "grad_norm": 0.26891544461250305, "learning_rate": 3.1746471709702964e-06, "loss": 0.1851, "num_input_tokens_seen": 6264936, "step": 32865 }, { "epoch": 17.084199584199585, "grad_norm": 1.0986757278442383, "learning_rate": 3.1691193469560426e-06, "loss": 0.1014, "num_input_tokens_seen": 6265896, "step": 32870 }, { "epoch": 17.08679833679834, "grad_norm": 0.23777753114700317, "learning_rate": 3.163596014074169e-06, "loss": 0.1383, "num_input_tokens_seen": 6266792, "step": 32875 }, { "epoch": 17.08939708939709, "grad_norm": 0.7154777646064758, "learning_rate": 3.158077173460955e-06, "loss": 0.1869, "num_input_tokens_seen": 6267720, "step": 32880 }, { "epoch": 17.091995841995843, "grad_norm": 1.0448402166366577, "learning_rate": 3.1525628262517536e-06, "loss": 0.0476, "num_input_tokens_seen": 6268648, "step": 32885 }, { "epoch": 17.094594594594593, "grad_norm": 1.398369312286377, "learning_rate": 3.1470529735810077e-06, "loss": 0.1416, "num_input_tokens_seen": 6269640, "step": 32890 }, { "epoch": 17.097193347193347, "grad_norm": 0.19351547956466675, "learning_rate": 3.1415476165822215e-06, "loss": 0.0726, "num_input_tokens_seen": 6270600, "step": 32895 }, { "epoch": 17.0997920997921, "grad_norm": 1.3471243381500244, "learning_rate": 3.1360467563879753e-06, "loss": 0.1795, "num_input_tokens_seen": 6271560, "step": 32900 }, { "epoch": 17.10239085239085, "grad_norm": 0.4314292073249817, "learning_rate": 3.13055039412993e-06, "loss": 0.0518, "num_input_tokens_seen": 6272488, "step": 32905 }, { "epoch": 17.104989604989605, "grad_norm": 0.07123695313930511, "learning_rate": 3.125058530938818e-06, "loss": 0.1865, "num_input_tokens_seen": 6273384, "step": 32910 }, { "epoch": 17.10758835758836, "grad_norm": 0.202358677983284, "learning_rate": 3.1195711679444433e-06, "loss": 0.116, "num_input_tokens_seen": 6274344, "step": 32915 }, { "epoch": 17.11018711018711, "grad_norm": 0.7794880867004395, "learning_rate": 3.1140883062756835e-06, "loss": 0.093, "num_input_tokens_seen": 6275304, "step": 32920 }, { "epoch": 17.112785862785863, "grad_norm": 1.34109628200531, "learning_rate": 3.108609947060506e-06, "loss": 0.1295, "num_input_tokens_seen": 6276200, "step": 32925 }, { "epoch": 17.115384615384617, "grad_norm": 0.2445794641971588, "learning_rate": 3.1031360914259345e-06, "loss": 0.1506, "num_input_tokens_seen": 6277160, "step": 32930 }, { "epoch": 17.117983367983367, "grad_norm": 1.1560420989990234, "learning_rate": 3.0976667404980703e-06, "loss": 0.1139, "num_input_tokens_seen": 6278152, "step": 32935 }, { "epoch": 17.12058212058212, "grad_norm": 0.9266790151596069, "learning_rate": 3.092201895402083e-06, "loss": 0.1256, "num_input_tokens_seen": 6279112, "step": 32940 }, { "epoch": 17.123180873180875, "grad_norm": 1.5009549856185913, "learning_rate": 3.086741557262235e-06, "loss": 0.1428, "num_input_tokens_seen": 6280104, "step": 32945 }, { "epoch": 17.125779625779625, "grad_norm": 0.6559979319572449, "learning_rate": 3.081285727201841e-06, "loss": 0.1229, "num_input_tokens_seen": 6281000, "step": 32950 }, { "epoch": 17.12837837837838, "grad_norm": 0.7660545110702515, "learning_rate": 3.0758344063432922e-06, "loss": 0.22, "num_input_tokens_seen": 6282024, "step": 32955 }, { "epoch": 17.13097713097713, "grad_norm": 0.23280781507492065, "learning_rate": 3.0703875958080553e-06, "loss": 0.096, "num_input_tokens_seen": 6282920, "step": 32960 }, { "epoch": 17.133575883575883, "grad_norm": 2.270233154296875, "learning_rate": 3.0649452967166785e-06, "loss": 0.1316, "num_input_tokens_seen": 6283912, "step": 32965 }, { "epoch": 17.136174636174637, "grad_norm": 1.5510435104370117, "learning_rate": 3.0595075101887694e-06, "loss": 0.1558, "num_input_tokens_seen": 6284872, "step": 32970 }, { "epoch": 17.138773388773387, "grad_norm": 0.3479997217655182, "learning_rate": 3.0540742373429964e-06, "loss": 0.1461, "num_input_tokens_seen": 6285832, "step": 32975 }, { "epoch": 17.14137214137214, "grad_norm": 0.23599275946617126, "learning_rate": 3.048645479297127e-06, "loss": 0.1376, "num_input_tokens_seen": 6286792, "step": 32980 }, { "epoch": 17.143970893970895, "grad_norm": 1.104630947113037, "learning_rate": 3.0432212371679867e-06, "loss": 0.1536, "num_input_tokens_seen": 6287816, "step": 32985 }, { "epoch": 17.146569646569645, "grad_norm": 0.6906998157501221, "learning_rate": 3.0378015120714586e-06, "loss": 0.0974, "num_input_tokens_seen": 6288776, "step": 32990 }, { "epoch": 17.1491683991684, "grad_norm": 0.5060268044471741, "learning_rate": 3.0323863051225256e-06, "loss": 0.1714, "num_input_tokens_seen": 6289640, "step": 32995 }, { "epoch": 17.151767151767153, "grad_norm": 1.0155372619628906, "learning_rate": 3.0269756174352152e-06, "loss": 0.0774, "num_input_tokens_seen": 6290600, "step": 33000 }, { "epoch": 17.154365904365903, "grad_norm": 0.7728695273399353, "learning_rate": 3.0215694501226384e-06, "loss": 0.181, "num_input_tokens_seen": 6291592, "step": 33005 }, { "epoch": 17.156964656964657, "grad_norm": 0.5191823244094849, "learning_rate": 3.0161678042969627e-06, "loss": 0.1515, "num_input_tokens_seen": 6292552, "step": 33010 }, { "epoch": 17.15956340956341, "grad_norm": 1.6668015718460083, "learning_rate": 3.0107706810694523e-06, "loss": 0.1056, "num_input_tokens_seen": 6293512, "step": 33015 }, { "epoch": 17.16216216216216, "grad_norm": 0.1804957091808319, "learning_rate": 3.005378081550414e-06, "loss": 0.1067, "num_input_tokens_seen": 6294536, "step": 33020 }, { "epoch": 17.164760914760915, "grad_norm": 0.35169631242752075, "learning_rate": 2.9999900068492374e-06, "loss": 0.1216, "num_input_tokens_seen": 6295432, "step": 33025 }, { "epoch": 17.16735966735967, "grad_norm": 0.9832718968391418, "learning_rate": 2.994606458074367e-06, "loss": 0.2168, "num_input_tokens_seen": 6296392, "step": 33030 }, { "epoch": 17.16995841995842, "grad_norm": 0.7886142730712891, "learning_rate": 2.98922743633335e-06, "loss": 0.1081, "num_input_tokens_seen": 6297320, "step": 33035 }, { "epoch": 17.172557172557173, "grad_norm": 0.8624871373176575, "learning_rate": 2.9838529427327594e-06, "loss": 0.1069, "num_input_tokens_seen": 6298312, "step": 33040 }, { "epoch": 17.175155925155924, "grad_norm": 1.7673594951629639, "learning_rate": 2.978482978378258e-06, "loss": 0.1861, "num_input_tokens_seen": 6299272, "step": 33045 }, { "epoch": 17.177754677754677, "grad_norm": 0.3357095420360565, "learning_rate": 2.9731175443745818e-06, "loss": 0.0851, "num_input_tokens_seen": 6300232, "step": 33050 }, { "epoch": 17.18035343035343, "grad_norm": 1.9601327180862427, "learning_rate": 2.967756641825531e-06, "loss": 0.1556, "num_input_tokens_seen": 6301192, "step": 33055 }, { "epoch": 17.18295218295218, "grad_norm": 0.6908663511276245, "learning_rate": 2.962400271833965e-06, "loss": 0.0775, "num_input_tokens_seen": 6302216, "step": 33060 }, { "epoch": 17.185550935550935, "grad_norm": 1.0469478368759155, "learning_rate": 2.957048435501811e-06, "loss": 0.1791, "num_input_tokens_seen": 6303144, "step": 33065 }, { "epoch": 17.18814968814969, "grad_norm": 0.6558203101158142, "learning_rate": 2.9517011339300848e-06, "loss": 0.1199, "num_input_tokens_seen": 6304040, "step": 33070 }, { "epoch": 17.19074844074844, "grad_norm": 0.7472519874572754, "learning_rate": 2.9463583682188433e-06, "loss": 0.2419, "num_input_tokens_seen": 6305000, "step": 33075 }, { "epoch": 17.193347193347194, "grad_norm": 0.2662537395954132, "learning_rate": 2.9410201394672265e-06, "loss": 0.0931, "num_input_tokens_seen": 6305992, "step": 33080 }, { "epoch": 17.195945945945947, "grad_norm": 0.7522170543670654, "learning_rate": 2.9356864487734254e-06, "loss": 0.0903, "num_input_tokens_seen": 6306888, "step": 33085 }, { "epoch": 17.198544698544698, "grad_norm": 0.47050052881240845, "learning_rate": 2.930357297234718e-06, "loss": 0.1108, "num_input_tokens_seen": 6307880, "step": 33090 }, { "epoch": 17.20114345114345, "grad_norm": 0.39025264978408813, "learning_rate": 2.925032685947432e-06, "loss": 0.1373, "num_input_tokens_seen": 6308840, "step": 33095 }, { "epoch": 17.203742203742205, "grad_norm": 0.4261513352394104, "learning_rate": 2.91971261600697e-06, "loss": 0.0943, "num_input_tokens_seen": 6309864, "step": 33100 }, { "epoch": 17.206340956340956, "grad_norm": 0.8380669355392456, "learning_rate": 2.914397088507795e-06, "loss": 0.151, "num_input_tokens_seen": 6310824, "step": 33105 }, { "epoch": 17.20893970893971, "grad_norm": 1.0815058946609497, "learning_rate": 2.9090861045434405e-06, "loss": 0.1461, "num_input_tokens_seen": 6311784, "step": 33110 }, { "epoch": 17.21153846153846, "grad_norm": 1.5376051664352417, "learning_rate": 2.903779665206491e-06, "loss": 0.1885, "num_input_tokens_seen": 6312712, "step": 33115 }, { "epoch": 17.214137214137214, "grad_norm": 0.3518892526626587, "learning_rate": 2.898477771588626e-06, "loss": 0.0528, "num_input_tokens_seen": 6313672, "step": 33120 }, { "epoch": 17.216735966735968, "grad_norm": 0.8328110575675964, "learning_rate": 2.893180424780559e-06, "loss": 0.0765, "num_input_tokens_seen": 6314600, "step": 33125 }, { "epoch": 17.219334719334718, "grad_norm": 0.15005680918693542, "learning_rate": 2.887887625872085e-06, "loss": 0.0573, "num_input_tokens_seen": 6315496, "step": 33130 }, { "epoch": 17.221933471933472, "grad_norm": 0.9624289274215698, "learning_rate": 2.88259937595205e-06, "loss": 0.1907, "num_input_tokens_seen": 6316456, "step": 33135 }, { "epoch": 17.224532224532226, "grad_norm": 0.2800530791282654, "learning_rate": 2.877315676108386e-06, "loss": 0.1397, "num_input_tokens_seen": 6317352, "step": 33140 }, { "epoch": 17.227130977130976, "grad_norm": 1.230928659439087, "learning_rate": 2.8720365274280738e-06, "loss": 0.215, "num_input_tokens_seen": 6318312, "step": 33145 }, { "epoch": 17.22972972972973, "grad_norm": 0.6663100719451904, "learning_rate": 2.8667619309971526e-06, "loss": 0.1588, "num_input_tokens_seen": 6319176, "step": 33150 }, { "epoch": 17.232328482328484, "grad_norm": 0.5793545246124268, "learning_rate": 2.861491887900733e-06, "loss": 0.0841, "num_input_tokens_seen": 6320104, "step": 33155 }, { "epoch": 17.234927234927234, "grad_norm": 0.09769666939973831, "learning_rate": 2.856226399222997e-06, "loss": 0.0531, "num_input_tokens_seen": 6321128, "step": 33160 }, { "epoch": 17.237525987525988, "grad_norm": 0.3813337981700897, "learning_rate": 2.850965466047181e-06, "loss": 0.0998, "num_input_tokens_seen": 6322088, "step": 33165 }, { "epoch": 17.24012474012474, "grad_norm": 0.22272174060344696, "learning_rate": 2.84570908945557e-06, "loss": 0.0766, "num_input_tokens_seen": 6323080, "step": 33170 }, { "epoch": 17.242723492723492, "grad_norm": 0.8688036203384399, "learning_rate": 2.8404572705295395e-06, "loss": 0.0813, "num_input_tokens_seen": 6324040, "step": 33175 }, { "epoch": 17.245322245322246, "grad_norm": 1.8624844551086426, "learning_rate": 2.835210010349512e-06, "loss": 0.0966, "num_input_tokens_seen": 6325064, "step": 33180 }, { "epoch": 17.247920997920996, "grad_norm": 1.0620416402816772, "learning_rate": 2.8299673099949684e-06, "loss": 0.1527, "num_input_tokens_seen": 6326024, "step": 33185 }, { "epoch": 17.25051975051975, "grad_norm": 0.5863646864891052, "learning_rate": 2.8247291705444575e-06, "loss": 0.1719, "num_input_tokens_seen": 6326952, "step": 33190 }, { "epoch": 17.253118503118504, "grad_norm": 1.2742961645126343, "learning_rate": 2.8194955930755967e-06, "loss": 0.1469, "num_input_tokens_seen": 6327912, "step": 33195 }, { "epoch": 17.255717255717254, "grad_norm": 1.0515488386154175, "learning_rate": 2.8142665786650534e-06, "loss": 0.1419, "num_input_tokens_seen": 6328904, "step": 33200 }, { "epoch": 17.258316008316008, "grad_norm": 1.286027431488037, "learning_rate": 2.8090421283885586e-06, "loss": 0.1807, "num_input_tokens_seen": 6329832, "step": 33205 }, { "epoch": 17.260914760914762, "grad_norm": 0.16431362926959991, "learning_rate": 2.8038222433209057e-06, "loss": 0.0774, "num_input_tokens_seen": 6330728, "step": 33210 }, { "epoch": 17.263513513513512, "grad_norm": 0.09865481406450272, "learning_rate": 2.798606924535954e-06, "loss": 0.0908, "num_input_tokens_seen": 6331656, "step": 33215 }, { "epoch": 17.266112266112266, "grad_norm": 0.23094281554222107, "learning_rate": 2.793396173106619e-06, "loss": 0.1604, "num_input_tokens_seen": 6332616, "step": 33220 }, { "epoch": 17.26871101871102, "grad_norm": 0.5273774266242981, "learning_rate": 2.788189990104867e-06, "loss": 0.1427, "num_input_tokens_seen": 6333576, "step": 33225 }, { "epoch": 17.27130977130977, "grad_norm": 0.575431764125824, "learning_rate": 2.78298837660175e-06, "loss": 0.1382, "num_input_tokens_seen": 6334600, "step": 33230 }, { "epoch": 17.273908523908524, "grad_norm": 0.15626856684684753, "learning_rate": 2.777791333667351e-06, "loss": 0.0945, "num_input_tokens_seen": 6335560, "step": 33235 }, { "epoch": 17.276507276507278, "grad_norm": 0.6948143839836121, "learning_rate": 2.772598862370826e-06, "loss": 0.224, "num_input_tokens_seen": 6336456, "step": 33240 }, { "epoch": 17.27910602910603, "grad_norm": 1.3322023153305054, "learning_rate": 2.767410963780395e-06, "loss": 0.0881, "num_input_tokens_seen": 6337448, "step": 33245 }, { "epoch": 17.281704781704782, "grad_norm": 0.6429280638694763, "learning_rate": 2.7622276389633328e-06, "loss": 0.1471, "num_input_tokens_seen": 6338472, "step": 33250 }, { "epoch": 17.284303534303533, "grad_norm": 0.5688532590866089, "learning_rate": 2.757048888985969e-06, "loss": 0.2124, "num_input_tokens_seen": 6339432, "step": 33255 }, { "epoch": 17.286902286902286, "grad_norm": 0.06843923777341843, "learning_rate": 2.751874714913694e-06, "loss": 0.0359, "num_input_tokens_seen": 6340392, "step": 33260 }, { "epoch": 17.28950103950104, "grad_norm": 0.3730420470237732, "learning_rate": 2.7467051178109685e-06, "loss": 0.0786, "num_input_tokens_seen": 6341384, "step": 33265 }, { "epoch": 17.29209979209979, "grad_norm": 0.8022746443748474, "learning_rate": 2.7415400987412935e-06, "loss": 0.168, "num_input_tokens_seen": 6342248, "step": 33270 }, { "epoch": 17.294698544698544, "grad_norm": 1.3171117305755615, "learning_rate": 2.736379658767238e-06, "loss": 0.1015, "num_input_tokens_seen": 6343208, "step": 33275 }, { "epoch": 17.2972972972973, "grad_norm": 0.7300923466682434, "learning_rate": 2.7312237989504247e-06, "loss": 0.1734, "num_input_tokens_seen": 6344136, "step": 33280 }, { "epoch": 17.29989604989605, "grad_norm": 1.1988980770111084, "learning_rate": 2.726072520351544e-06, "loss": 0.0849, "num_input_tokens_seen": 6345096, "step": 33285 }, { "epoch": 17.302494802494802, "grad_norm": 1.3597053289413452, "learning_rate": 2.720925824030335e-06, "loss": 0.1857, "num_input_tokens_seen": 6346120, "step": 33290 }, { "epoch": 17.305093555093556, "grad_norm": 0.63080894947052, "learning_rate": 2.7157837110455913e-06, "loss": 0.2038, "num_input_tokens_seen": 6347080, "step": 33295 }, { "epoch": 17.307692307692307, "grad_norm": 0.18605583906173706, "learning_rate": 2.710646182455173e-06, "loss": 0.0738, "num_input_tokens_seen": 6348008, "step": 33300 }, { "epoch": 17.31029106029106, "grad_norm": 0.46693676710128784, "learning_rate": 2.705513239315988e-06, "loss": 0.0776, "num_input_tokens_seen": 6349032, "step": 33305 }, { "epoch": 17.312889812889814, "grad_norm": 1.6466394662857056, "learning_rate": 2.7003848826840067e-06, "loss": 0.1583, "num_input_tokens_seen": 6349960, "step": 33310 }, { "epoch": 17.315488565488565, "grad_norm": 0.6591888666152954, "learning_rate": 2.6952611136142477e-06, "loss": 0.1293, "num_input_tokens_seen": 6350856, "step": 33315 }, { "epoch": 17.31808731808732, "grad_norm": 1.1408240795135498, "learning_rate": 2.6901419331608057e-06, "loss": 0.1131, "num_input_tokens_seen": 6351912, "step": 33320 }, { "epoch": 17.320686070686072, "grad_norm": 1.1720973253250122, "learning_rate": 2.6850273423768115e-06, "loss": 0.1406, "num_input_tokens_seen": 6352904, "step": 33325 }, { "epoch": 17.323284823284823, "grad_norm": 2.7725958824157715, "learning_rate": 2.6799173423144584e-06, "loss": 0.1159, "num_input_tokens_seen": 6353864, "step": 33330 }, { "epoch": 17.325883575883577, "grad_norm": 0.899341344833374, "learning_rate": 2.67481193402499e-06, "loss": 0.1723, "num_input_tokens_seen": 6354792, "step": 33335 }, { "epoch": 17.328482328482327, "grad_norm": 0.5580780506134033, "learning_rate": 2.669711118558718e-06, "loss": 0.1511, "num_input_tokens_seen": 6355816, "step": 33340 }, { "epoch": 17.33108108108108, "grad_norm": 1.3721764087677002, "learning_rate": 2.664614896965001e-06, "loss": 0.2352, "num_input_tokens_seen": 6356840, "step": 33345 }, { "epoch": 17.333679833679835, "grad_norm": 0.8400031924247742, "learning_rate": 2.659523270292244e-06, "loss": 0.1009, "num_input_tokens_seen": 6357800, "step": 33350 }, { "epoch": 17.336278586278585, "grad_norm": 0.3087671995162964, "learning_rate": 2.6544362395879317e-06, "loss": 0.1423, "num_input_tokens_seen": 6358728, "step": 33355 }, { "epoch": 17.33887733887734, "grad_norm": 0.3791445791721344, "learning_rate": 2.6493538058985775e-06, "loss": 0.0572, "num_input_tokens_seen": 6359720, "step": 33360 }, { "epoch": 17.341476091476093, "grad_norm": 0.49409323930740356, "learning_rate": 2.6442759702697606e-06, "loss": 0.1425, "num_input_tokens_seen": 6360584, "step": 33365 }, { "epoch": 17.344074844074843, "grad_norm": 1.1565027236938477, "learning_rate": 2.639202733746113e-06, "loss": 0.0865, "num_input_tokens_seen": 6361576, "step": 33370 }, { "epoch": 17.346673596673597, "grad_norm": 0.1821756213903427, "learning_rate": 2.6341340973713187e-06, "loss": 0.0978, "num_input_tokens_seen": 6362504, "step": 33375 }, { "epoch": 17.34927234927235, "grad_norm": 0.7994769215583801, "learning_rate": 2.629070062188119e-06, "loss": 0.0318, "num_input_tokens_seen": 6363400, "step": 33380 }, { "epoch": 17.3518711018711, "grad_norm": 0.23650069534778595, "learning_rate": 2.6240106292383022e-06, "loss": 0.0999, "num_input_tokens_seen": 6364328, "step": 33385 }, { "epoch": 17.354469854469855, "grad_norm": 0.09372654557228088, "learning_rate": 2.618955799562722e-06, "loss": 0.13, "num_input_tokens_seen": 6365288, "step": 33390 }, { "epoch": 17.35706860706861, "grad_norm": 0.678342342376709, "learning_rate": 2.613905574201275e-06, "loss": 0.1355, "num_input_tokens_seen": 6366280, "step": 33395 }, { "epoch": 17.35966735966736, "grad_norm": 0.12663066387176514, "learning_rate": 2.6088599541929104e-06, "loss": 0.072, "num_input_tokens_seen": 6367240, "step": 33400 }, { "epoch": 17.362266112266113, "grad_norm": 0.5117924809455872, "learning_rate": 2.603818940575628e-06, "loss": 0.2421, "num_input_tokens_seen": 6368168, "step": 33405 }, { "epoch": 17.364864864864863, "grad_norm": 0.3004331886768341, "learning_rate": 2.5987825343864956e-06, "loss": 0.085, "num_input_tokens_seen": 6369128, "step": 33410 }, { "epoch": 17.367463617463617, "grad_norm": 1.072529911994934, "learning_rate": 2.5937507366616154e-06, "loss": 0.1869, "num_input_tokens_seen": 6370056, "step": 33415 }, { "epoch": 17.37006237006237, "grad_norm": 0.11089766770601273, "learning_rate": 2.5887235484361506e-06, "loss": 0.0808, "num_input_tokens_seen": 6371080, "step": 33420 }, { "epoch": 17.37266112266112, "grad_norm": 0.3768017590045929, "learning_rate": 2.5837009707443042e-06, "loss": 0.1231, "num_input_tokens_seen": 6372072, "step": 33425 }, { "epoch": 17.375259875259875, "grad_norm": 1.0091898441314697, "learning_rate": 2.5786830046193587e-06, "loss": 0.1329, "num_input_tokens_seen": 6373064, "step": 33430 }, { "epoch": 17.37785862785863, "grad_norm": 0.7257794737815857, "learning_rate": 2.573669651093616e-06, "loss": 0.1675, "num_input_tokens_seen": 6373992, "step": 33435 }, { "epoch": 17.38045738045738, "grad_norm": 0.8129843473434448, "learning_rate": 2.5686609111984368e-06, "loss": 0.1652, "num_input_tokens_seen": 6374952, "step": 33440 }, { "epoch": 17.383056133056133, "grad_norm": 0.22372636198997498, "learning_rate": 2.5636567859642525e-06, "loss": 0.1347, "num_input_tokens_seen": 6375944, "step": 33445 }, { "epoch": 17.385654885654887, "grad_norm": 0.9724411368370056, "learning_rate": 2.558657276420526e-06, "loss": 0.2083, "num_input_tokens_seen": 6376872, "step": 33450 }, { "epoch": 17.388253638253637, "grad_norm": 0.5104211568832397, "learning_rate": 2.5536623835957727e-06, "loss": 0.09, "num_input_tokens_seen": 6377864, "step": 33455 }, { "epoch": 17.39085239085239, "grad_norm": 0.5016099214553833, "learning_rate": 2.548672108517558e-06, "loss": 0.1319, "num_input_tokens_seen": 6378760, "step": 33460 }, { "epoch": 17.393451143451145, "grad_norm": 0.12656496465206146, "learning_rate": 2.54368645221251e-06, "loss": 0.1061, "num_input_tokens_seen": 6379656, "step": 33465 }, { "epoch": 17.396049896049895, "grad_norm": 0.951294481754303, "learning_rate": 2.538705415706291e-06, "loss": 0.1478, "num_input_tokens_seen": 6380584, "step": 33470 }, { "epoch": 17.39864864864865, "grad_norm": 1.3919315338134766, "learning_rate": 2.533729000023616e-06, "loss": 0.2079, "num_input_tokens_seen": 6381512, "step": 33475 }, { "epoch": 17.401247401247403, "grad_norm": 1.0678362846374512, "learning_rate": 2.5287572061882618e-06, "loss": 0.1044, "num_input_tokens_seen": 6382504, "step": 33480 }, { "epoch": 17.403846153846153, "grad_norm": 0.7460529804229736, "learning_rate": 2.5237900352230393e-06, "loss": 0.2152, "num_input_tokens_seen": 6383496, "step": 33485 }, { "epoch": 17.406444906444907, "grad_norm": 1.2914477586746216, "learning_rate": 2.5188274881498136e-06, "loss": 0.1351, "num_input_tokens_seen": 6384424, "step": 33490 }, { "epoch": 17.409043659043657, "grad_norm": 0.41584065556526184, "learning_rate": 2.513869565989499e-06, "loss": 0.1929, "num_input_tokens_seen": 6385384, "step": 33495 }, { "epoch": 17.41164241164241, "grad_norm": 0.8231315016746521, "learning_rate": 2.508916269762057e-06, "loss": 0.1064, "num_input_tokens_seen": 6386312, "step": 33500 }, { "epoch": 17.414241164241165, "grad_norm": 0.7832927107810974, "learning_rate": 2.503967600486504e-06, "loss": 0.0937, "num_input_tokens_seen": 6387240, "step": 33505 }, { "epoch": 17.416839916839916, "grad_norm": 1.0397062301635742, "learning_rate": 2.499023559180888e-06, "loss": 0.2095, "num_input_tokens_seen": 6388136, "step": 33510 }, { "epoch": 17.41943866943867, "grad_norm": 1.0753883123397827, "learning_rate": 2.4940841468623272e-06, "loss": 0.1336, "num_input_tokens_seen": 6389000, "step": 33515 }, { "epoch": 17.422037422037423, "grad_norm": 1.5457521677017212, "learning_rate": 2.489149364546975e-06, "loss": 0.0941, "num_input_tokens_seen": 6389960, "step": 33520 }, { "epoch": 17.424636174636174, "grad_norm": 0.911537766456604, "learning_rate": 2.48421921325003e-06, "loss": 0.0919, "num_input_tokens_seen": 6390952, "step": 33525 }, { "epoch": 17.427234927234927, "grad_norm": 0.729286789894104, "learning_rate": 2.47929369398574e-06, "loss": 0.1158, "num_input_tokens_seen": 6391912, "step": 33530 }, { "epoch": 17.42983367983368, "grad_norm": 0.3495543599128723, "learning_rate": 2.474372807767408e-06, "loss": 0.0853, "num_input_tokens_seen": 6392808, "step": 33535 }, { "epoch": 17.43243243243243, "grad_norm": 0.13271212577819824, "learning_rate": 2.4694565556073746e-06, "loss": 0.0404, "num_input_tokens_seen": 6393704, "step": 33540 }, { "epoch": 17.435031185031185, "grad_norm": 1.7521990537643433, "learning_rate": 2.46454493851703e-06, "loss": 0.1636, "num_input_tokens_seen": 6394696, "step": 33545 }, { "epoch": 17.43762993762994, "grad_norm": 0.2844812572002411, "learning_rate": 2.459637957506808e-06, "loss": 0.0661, "num_input_tokens_seen": 6395592, "step": 33550 }, { "epoch": 17.44022869022869, "grad_norm": 1.7481633424758911, "learning_rate": 2.4547356135861954e-06, "loss": 0.0871, "num_input_tokens_seen": 6396520, "step": 33555 }, { "epoch": 17.442827442827443, "grad_norm": 0.3040158748626709, "learning_rate": 2.4498379077637275e-06, "loss": 0.103, "num_input_tokens_seen": 6397544, "step": 33560 }, { "epoch": 17.445426195426194, "grad_norm": 1.525252342224121, "learning_rate": 2.4449448410469616e-06, "loss": 0.2517, "num_input_tokens_seen": 6398504, "step": 33565 }, { "epoch": 17.448024948024948, "grad_norm": 1.4066284894943237, "learning_rate": 2.440056414442532e-06, "loss": 0.1719, "num_input_tokens_seen": 6399464, "step": 33570 }, { "epoch": 17.4506237006237, "grad_norm": 0.1288827359676361, "learning_rate": 2.4351726289561035e-06, "loss": 0.0506, "num_input_tokens_seen": 6400392, "step": 33575 }, { "epoch": 17.453222453222452, "grad_norm": 0.44072678685188293, "learning_rate": 2.4302934855923804e-06, "loss": 0.0402, "num_input_tokens_seen": 6401384, "step": 33580 }, { "epoch": 17.455821205821206, "grad_norm": 0.5797369480133057, "learning_rate": 2.4254189853551214e-06, "loss": 0.0905, "num_input_tokens_seen": 6402344, "step": 33585 }, { "epoch": 17.45841995841996, "grad_norm": 0.6783599853515625, "learning_rate": 2.420549129247132e-06, "loss": 0.0804, "num_input_tokens_seen": 6403208, "step": 33590 }, { "epoch": 17.46101871101871, "grad_norm": 0.557202935218811, "learning_rate": 2.4156839182702534e-06, "loss": 0.09, "num_input_tokens_seen": 6404168, "step": 33595 }, { "epoch": 17.463617463617464, "grad_norm": 0.230004221200943, "learning_rate": 2.4108233534253734e-06, "loss": 0.1519, "num_input_tokens_seen": 6405288, "step": 33600 }, { "epoch": 17.466216216216218, "grad_norm": 0.13431423902511597, "learning_rate": 2.4059674357124335e-06, "loss": 0.216, "num_input_tokens_seen": 6406216, "step": 33605 }, { "epoch": 17.468814968814968, "grad_norm": 0.3063478469848633, "learning_rate": 2.4011161661304072e-06, "loss": 0.1062, "num_input_tokens_seen": 6407176, "step": 33610 }, { "epoch": 17.47141372141372, "grad_norm": 0.27255645394325256, "learning_rate": 2.3962695456773153e-06, "loss": 0.096, "num_input_tokens_seen": 6408136, "step": 33615 }, { "epoch": 17.474012474012476, "grad_norm": 0.47410792112350464, "learning_rate": 2.391427575350219e-06, "loss": 0.0812, "num_input_tokens_seen": 6409096, "step": 33620 }, { "epoch": 17.476611226611226, "grad_norm": 0.7632502913475037, "learning_rate": 2.3865902561452437e-06, "loss": 0.2186, "num_input_tokens_seen": 6410056, "step": 33625 }, { "epoch": 17.47920997920998, "grad_norm": 0.39860743284225464, "learning_rate": 2.3817575890575233e-06, "loss": 0.1674, "num_input_tokens_seen": 6411048, "step": 33630 }, { "epoch": 17.48180873180873, "grad_norm": 0.3017159104347229, "learning_rate": 2.376929575081252e-06, "loss": 0.1668, "num_input_tokens_seen": 6412008, "step": 33635 }, { "epoch": 17.484407484407484, "grad_norm": 0.06260491907596588, "learning_rate": 2.3721062152096825e-06, "loss": 0.0569, "num_input_tokens_seen": 6412936, "step": 33640 }, { "epoch": 17.487006237006238, "grad_norm": 0.16018922626972198, "learning_rate": 2.3672875104350844e-06, "loss": 0.1625, "num_input_tokens_seen": 6413928, "step": 33645 }, { "epoch": 17.489604989604988, "grad_norm": 0.10476154088973999, "learning_rate": 2.3624734617487848e-06, "loss": 0.0588, "num_input_tokens_seen": 6414856, "step": 33650 }, { "epoch": 17.492203742203742, "grad_norm": 0.9055771827697754, "learning_rate": 2.3576640701411383e-06, "loss": 0.1275, "num_input_tokens_seen": 6415880, "step": 33655 }, { "epoch": 17.494802494802496, "grad_norm": 0.29334205389022827, "learning_rate": 2.352859336601565e-06, "loss": 0.1164, "num_input_tokens_seen": 6416840, "step": 33660 }, { "epoch": 17.497401247401246, "grad_norm": 0.860845685005188, "learning_rate": 2.348059262118507e-06, "loss": 0.0898, "num_input_tokens_seen": 6417800, "step": 33665 }, { "epoch": 17.5, "grad_norm": 0.3444586396217346, "learning_rate": 2.343263847679453e-06, "loss": 0.0488, "num_input_tokens_seen": 6418696, "step": 33670 }, { "epoch": 17.502598752598754, "grad_norm": 0.5905164480209351, "learning_rate": 2.338473094270929e-06, "loss": 0.0396, "num_input_tokens_seen": 6419592, "step": 33675 }, { "epoch": 17.505197505197504, "grad_norm": 0.32634133100509644, "learning_rate": 2.333687002878521e-06, "loss": 0.0831, "num_input_tokens_seen": 6420584, "step": 33680 }, { "epoch": 17.507796257796258, "grad_norm": 1.1792107820510864, "learning_rate": 2.328905574486831e-06, "loss": 0.1663, "num_input_tokens_seen": 6421576, "step": 33685 }, { "epoch": 17.510395010395012, "grad_norm": 0.07949206233024597, "learning_rate": 2.3241288100795178e-06, "loss": 0.2633, "num_input_tokens_seen": 6422504, "step": 33690 }, { "epoch": 17.512993762993762, "grad_norm": 0.5339831709861755, "learning_rate": 2.3193567106392728e-06, "loss": 0.1577, "num_input_tokens_seen": 6423464, "step": 33695 }, { "epoch": 17.515592515592516, "grad_norm": 1.0310068130493164, "learning_rate": 2.31458927714783e-06, "loss": 0.1435, "num_input_tokens_seen": 6424360, "step": 33700 }, { "epoch": 17.518191268191266, "grad_norm": 0.08310423791408539, "learning_rate": 2.309826510585966e-06, "loss": 0.0858, "num_input_tokens_seen": 6425288, "step": 33705 }, { "epoch": 17.52079002079002, "grad_norm": 0.3107556402683258, "learning_rate": 2.305068411933492e-06, "loss": 0.1473, "num_input_tokens_seen": 6426280, "step": 33710 }, { "epoch": 17.523388773388774, "grad_norm": 1.5607929229736328, "learning_rate": 2.3003149821692687e-06, "loss": 0.101, "num_input_tokens_seen": 6427272, "step": 33715 }, { "epoch": 17.525987525987524, "grad_norm": 0.5675970911979675, "learning_rate": 2.295566222271189e-06, "loss": 0.1795, "num_input_tokens_seen": 6428232, "step": 33720 }, { "epoch": 17.52858627858628, "grad_norm": 0.06912980228662491, "learning_rate": 2.290822133216175e-06, "loss": 0.0402, "num_input_tokens_seen": 6429192, "step": 33725 }, { "epoch": 17.531185031185032, "grad_norm": 1.0439190864562988, "learning_rate": 2.2860827159802146e-06, "loss": 0.0848, "num_input_tokens_seen": 6430184, "step": 33730 }, { "epoch": 17.533783783783782, "grad_norm": 0.9558357000350952, "learning_rate": 2.281347971538311e-06, "loss": 0.1429, "num_input_tokens_seen": 6431112, "step": 33735 }, { "epoch": 17.536382536382536, "grad_norm": 0.22262944281101227, "learning_rate": 2.276617900864514e-06, "loss": 0.0966, "num_input_tokens_seen": 6432008, "step": 33740 }, { "epoch": 17.53898128898129, "grad_norm": 0.31018906831741333, "learning_rate": 2.271892504931905e-06, "loss": 0.0386, "num_input_tokens_seen": 6432968, "step": 33745 }, { "epoch": 17.54158004158004, "grad_norm": 0.3284442126750946, "learning_rate": 2.267171784712627e-06, "loss": 0.1087, "num_input_tokens_seen": 6433928, "step": 33750 }, { "epoch": 17.544178794178794, "grad_norm": 0.6435651779174805, "learning_rate": 2.262455741177838e-06, "loss": 0.264, "num_input_tokens_seen": 6434952, "step": 33755 }, { "epoch": 17.546777546777548, "grad_norm": 0.37062665820121765, "learning_rate": 2.2577443752977297e-06, "loss": 0.1126, "num_input_tokens_seen": 6435944, "step": 33760 }, { "epoch": 17.5493762993763, "grad_norm": 0.6869094967842102, "learning_rate": 2.253037688041554e-06, "loss": 0.153, "num_input_tokens_seen": 6436936, "step": 33765 }, { "epoch": 17.551975051975052, "grad_norm": 1.0296120643615723, "learning_rate": 2.248335680377586e-06, "loss": 0.1818, "num_input_tokens_seen": 6437896, "step": 33770 }, { "epoch": 17.554573804573806, "grad_norm": 1.0512220859527588, "learning_rate": 2.243638353273139e-06, "loss": 0.1119, "num_input_tokens_seen": 6438984, "step": 33775 }, { "epoch": 17.557172557172557, "grad_norm": 0.7686329483985901, "learning_rate": 2.238945707694562e-06, "loss": 0.1278, "num_input_tokens_seen": 6439944, "step": 33780 }, { "epoch": 17.55977130977131, "grad_norm": 0.2932021915912628, "learning_rate": 2.2342577446072544e-06, "loss": 0.1278, "num_input_tokens_seen": 6440968, "step": 33785 }, { "epoch": 17.56237006237006, "grad_norm": 0.5622022747993469, "learning_rate": 2.229574464975634e-06, "loss": 0.111, "num_input_tokens_seen": 6441960, "step": 33790 }, { "epoch": 17.564968814968815, "grad_norm": 1.5143795013427734, "learning_rate": 2.2248958697631634e-06, "loss": 0.1066, "num_input_tokens_seen": 6442792, "step": 33795 }, { "epoch": 17.56756756756757, "grad_norm": 0.8981793522834778, "learning_rate": 2.2202219599323394e-06, "loss": 0.2212, "num_input_tokens_seen": 6443688, "step": 33800 }, { "epoch": 17.57016632016632, "grad_norm": 0.06933899223804474, "learning_rate": 2.2155527364447064e-06, "loss": 0.105, "num_input_tokens_seen": 6444712, "step": 33805 }, { "epoch": 17.572765072765073, "grad_norm": 0.8304980397224426, "learning_rate": 2.2108882002608246e-06, "loss": 0.1033, "num_input_tokens_seen": 6445704, "step": 33810 }, { "epoch": 17.575363825363826, "grad_norm": 1.3915332555770874, "learning_rate": 2.206228352340306e-06, "loss": 0.1036, "num_input_tokens_seen": 6446664, "step": 33815 }, { "epoch": 17.577962577962577, "grad_norm": 1.0572471618652344, "learning_rate": 2.20157319364179e-06, "loss": 0.0883, "num_input_tokens_seen": 6447592, "step": 33820 }, { "epoch": 17.58056133056133, "grad_norm": 0.4802098274230957, "learning_rate": 2.1969227251229547e-06, "loss": 0.2043, "num_input_tokens_seen": 6448520, "step": 33825 }, { "epoch": 17.583160083160084, "grad_norm": 0.18618495762348175, "learning_rate": 2.1922769477405097e-06, "loss": 0.1, "num_input_tokens_seen": 6449448, "step": 33830 }, { "epoch": 17.585758835758835, "grad_norm": 0.24616223573684692, "learning_rate": 2.187635862450199e-06, "loss": 0.0768, "num_input_tokens_seen": 6450408, "step": 33835 }, { "epoch": 17.58835758835759, "grad_norm": 0.6768829226493835, "learning_rate": 2.1829994702068146e-06, "loss": 0.1102, "num_input_tokens_seen": 6451304, "step": 33840 }, { "epoch": 17.590956340956343, "grad_norm": 0.5947362780570984, "learning_rate": 2.178367771964168e-06, "loss": 0.1633, "num_input_tokens_seen": 6452264, "step": 33845 }, { "epoch": 17.593555093555093, "grad_norm": 0.6410449743270874, "learning_rate": 2.173740768675106e-06, "loss": 0.1786, "num_input_tokens_seen": 6453256, "step": 33850 }, { "epoch": 17.596153846153847, "grad_norm": 0.8068812489509583, "learning_rate": 2.1691184612915196e-06, "loss": 0.1945, "num_input_tokens_seen": 6454152, "step": 33855 }, { "epoch": 17.598752598752597, "grad_norm": 0.15048879384994507, "learning_rate": 2.1645008507643254e-06, "loss": 0.2106, "num_input_tokens_seen": 6455048, "step": 33860 }, { "epoch": 17.60135135135135, "grad_norm": 1.9563639163970947, "learning_rate": 2.1598879380434756e-06, "loss": 0.1387, "num_input_tokens_seen": 6456008, "step": 33865 }, { "epoch": 17.603950103950105, "grad_norm": 1.4246172904968262, "learning_rate": 2.155279724077952e-06, "loss": 0.1421, "num_input_tokens_seen": 6456904, "step": 33870 }, { "epoch": 17.606548856548855, "grad_norm": 0.6291581988334656, "learning_rate": 2.150676209815783e-06, "loss": 0.1249, "num_input_tokens_seen": 6457832, "step": 33875 }, { "epoch": 17.60914760914761, "grad_norm": 0.8591254353523254, "learning_rate": 2.1460773962040158e-06, "loss": 0.0816, "num_input_tokens_seen": 6458728, "step": 33880 }, { "epoch": 17.611746361746363, "grad_norm": 0.39417943358421326, "learning_rate": 2.141483284188736e-06, "loss": 0.0863, "num_input_tokens_seen": 6459656, "step": 33885 }, { "epoch": 17.614345114345113, "grad_norm": 1.4359393119812012, "learning_rate": 2.136893874715065e-06, "loss": 0.1894, "num_input_tokens_seen": 6460648, "step": 33890 }, { "epoch": 17.616943866943867, "grad_norm": 0.17493125796318054, "learning_rate": 2.132309168727148e-06, "loss": 0.1695, "num_input_tokens_seen": 6461640, "step": 33895 }, { "epoch": 17.61954261954262, "grad_norm": 0.7514751553535461, "learning_rate": 2.1277291671681716e-06, "loss": 0.0929, "num_input_tokens_seen": 6462600, "step": 33900 }, { "epoch": 17.62214137214137, "grad_norm": 0.8809252381324768, "learning_rate": 2.1231538709803487e-06, "loss": 0.1298, "num_input_tokens_seen": 6463560, "step": 33905 }, { "epoch": 17.624740124740125, "grad_norm": 0.4495885968208313, "learning_rate": 2.1185832811049327e-06, "loss": 0.1643, "num_input_tokens_seen": 6464456, "step": 33910 }, { "epoch": 17.62733887733888, "grad_norm": 0.7770628929138184, "learning_rate": 2.1140173984821987e-06, "loss": 0.0829, "num_input_tokens_seen": 6465448, "step": 33915 }, { "epoch": 17.62993762993763, "grad_norm": 0.7430617213249207, "learning_rate": 2.1094562240514563e-06, "loss": 0.1057, "num_input_tokens_seen": 6466504, "step": 33920 }, { "epoch": 17.632536382536383, "grad_norm": 0.6033602952957153, "learning_rate": 2.1048997587510473e-06, "loss": 0.1211, "num_input_tokens_seen": 6467432, "step": 33925 }, { "epoch": 17.635135135135137, "grad_norm": 0.9720961451530457, "learning_rate": 2.1003480035183513e-06, "loss": 0.2286, "num_input_tokens_seen": 6468456, "step": 33930 }, { "epoch": 17.637733887733887, "grad_norm": 0.14340078830718994, "learning_rate": 2.0958009592897705e-06, "loss": 0.1041, "num_input_tokens_seen": 6469448, "step": 33935 }, { "epoch": 17.64033264033264, "grad_norm": 1.1962103843688965, "learning_rate": 2.0912586270007388e-06, "loss": 0.1874, "num_input_tokens_seen": 6470376, "step": 33940 }, { "epoch": 17.64293139293139, "grad_norm": 0.1779811978340149, "learning_rate": 2.086721007585715e-06, "loss": 0.1768, "num_input_tokens_seen": 6471368, "step": 33945 }, { "epoch": 17.645530145530145, "grad_norm": 0.15954624116420746, "learning_rate": 2.0821881019782134e-06, "loss": 0.1092, "num_input_tokens_seen": 6472296, "step": 33950 }, { "epoch": 17.6481288981289, "grad_norm": 1.2031068801879883, "learning_rate": 2.0776599111107466e-06, "loss": 0.1234, "num_input_tokens_seen": 6473320, "step": 33955 }, { "epoch": 17.65072765072765, "grad_norm": 0.16479162871837616, "learning_rate": 2.0731364359148687e-06, "loss": 0.068, "num_input_tokens_seen": 6474312, "step": 33960 }, { "epoch": 17.653326403326403, "grad_norm": 0.1586315631866455, "learning_rate": 2.0686176773211776e-06, "loss": 0.0687, "num_input_tokens_seen": 6475240, "step": 33965 }, { "epoch": 17.655925155925157, "grad_norm": 0.6830190420150757, "learning_rate": 2.0641036362592818e-06, "loss": 0.1341, "num_input_tokens_seen": 6476232, "step": 33970 }, { "epoch": 17.658523908523907, "grad_norm": 0.26460936665534973, "learning_rate": 2.0595943136578284e-06, "loss": 0.046, "num_input_tokens_seen": 6477128, "step": 33975 }, { "epoch": 17.66112266112266, "grad_norm": 1.14389967918396, "learning_rate": 2.055089710444497e-06, "loss": 0.0937, "num_input_tokens_seen": 6478120, "step": 33980 }, { "epoch": 17.663721413721415, "grad_norm": 0.17848676443099976, "learning_rate": 2.05058982754599e-06, "loss": 0.068, "num_input_tokens_seen": 6479016, "step": 33985 }, { "epoch": 17.666320166320165, "grad_norm": 0.6516396999359131, "learning_rate": 2.0460946658880376e-06, "loss": 0.156, "num_input_tokens_seen": 6480040, "step": 33990 }, { "epoch": 17.66891891891892, "grad_norm": 0.44468480348587036, "learning_rate": 2.0416042263953996e-06, "loss": 0.0885, "num_input_tokens_seen": 6481000, "step": 33995 }, { "epoch": 17.671517671517673, "grad_norm": 0.27275729179382324, "learning_rate": 2.0371185099918756e-06, "loss": 0.3068, "num_input_tokens_seen": 6481992, "step": 34000 }, { "epoch": 17.674116424116423, "grad_norm": 0.2508021891117096, "learning_rate": 2.0326375176002777e-06, "loss": 0.0763, "num_input_tokens_seen": 6482984, "step": 34005 }, { "epoch": 17.676715176715177, "grad_norm": 1.5953150987625122, "learning_rate": 2.0281612501424546e-06, "loss": 0.2626, "num_input_tokens_seen": 6483912, "step": 34010 }, { "epoch": 17.679313929313928, "grad_norm": 0.6838648915290833, "learning_rate": 2.023689708539278e-06, "loss": 0.0888, "num_input_tokens_seen": 6484776, "step": 34015 }, { "epoch": 17.68191268191268, "grad_norm": 0.6190448999404907, "learning_rate": 2.019222893710657e-06, "loss": 0.1527, "num_input_tokens_seen": 6485704, "step": 34020 }, { "epoch": 17.684511434511435, "grad_norm": 0.5562171936035156, "learning_rate": 2.014760806575516e-06, "loss": 0.2162, "num_input_tokens_seen": 6486664, "step": 34025 }, { "epoch": 17.687110187110186, "grad_norm": 0.12911757826805115, "learning_rate": 2.010303448051809e-06, "loss": 0.1425, "num_input_tokens_seen": 6487624, "step": 34030 }, { "epoch": 17.68970893970894, "grad_norm": 1.1050199270248413, "learning_rate": 2.0058508190565315e-06, "loss": 0.2093, "num_input_tokens_seen": 6488584, "step": 34035 }, { "epoch": 17.692307692307693, "grad_norm": 0.1307939887046814, "learning_rate": 2.0014029205056935e-06, "loss": 0.0616, "num_input_tokens_seen": 6489544, "step": 34040 }, { "epoch": 17.694906444906444, "grad_norm": 0.23187555372714996, "learning_rate": 1.9969597533143285e-06, "loss": 0.0407, "num_input_tokens_seen": 6490504, "step": 34045 }, { "epoch": 17.697505197505198, "grad_norm": 0.6542294025421143, "learning_rate": 1.992521318396498e-06, "loss": 0.1402, "num_input_tokens_seen": 6491464, "step": 34050 }, { "epoch": 17.70010395010395, "grad_norm": 0.396708607673645, "learning_rate": 1.988087616665307e-06, "loss": 0.0331, "num_input_tokens_seen": 6492392, "step": 34055 }, { "epoch": 17.7027027027027, "grad_norm": 0.19002774357795715, "learning_rate": 1.9836586490328627e-06, "loss": 0.0281, "num_input_tokens_seen": 6493256, "step": 34060 }, { "epoch": 17.705301455301456, "grad_norm": 0.7742255330085754, "learning_rate": 1.9792344164103167e-06, "loss": 0.2953, "num_input_tokens_seen": 6494248, "step": 34065 }, { "epoch": 17.70790020790021, "grad_norm": 0.12962129712104797, "learning_rate": 1.974814919707829e-06, "loss": 0.1263, "num_input_tokens_seen": 6495112, "step": 34070 }, { "epoch": 17.71049896049896, "grad_norm": 0.34771570563316345, "learning_rate": 1.9704001598346066e-06, "loss": 0.2709, "num_input_tokens_seen": 6496040, "step": 34075 }, { "epoch": 17.713097713097714, "grad_norm": 0.06605081260204315, "learning_rate": 1.965990137698867e-06, "loss": 0.1396, "num_input_tokens_seen": 6496936, "step": 34080 }, { "epoch": 17.715696465696467, "grad_norm": 0.11897895485162735, "learning_rate": 1.9615848542078567e-06, "loss": 0.1008, "num_input_tokens_seen": 6497832, "step": 34085 }, { "epoch": 17.718295218295218, "grad_norm": 1.0706647634506226, "learning_rate": 1.9571843102678455e-06, "loss": 0.2372, "num_input_tokens_seen": 6498920, "step": 34090 }, { "epoch": 17.72089397089397, "grad_norm": 0.20628878474235535, "learning_rate": 1.9527885067841282e-06, "loss": 0.0805, "num_input_tokens_seen": 6499752, "step": 34095 }, { "epoch": 17.723492723492722, "grad_norm": 0.18426433205604553, "learning_rate": 1.948397444661029e-06, "loss": 0.0893, "num_input_tokens_seen": 6500712, "step": 34100 }, { "epoch": 17.726091476091476, "grad_norm": 0.48465633392333984, "learning_rate": 1.9440111248018976e-06, "loss": 0.1765, "num_input_tokens_seen": 6501672, "step": 34105 }, { "epoch": 17.72869022869023, "grad_norm": 0.3725895285606384, "learning_rate": 1.939629548109101e-06, "loss": 0.2215, "num_input_tokens_seen": 6502664, "step": 34110 }, { "epoch": 17.73128898128898, "grad_norm": 1.4817317724227905, "learning_rate": 1.9352527154840345e-06, "loss": 0.1151, "num_input_tokens_seen": 6503624, "step": 34115 }, { "epoch": 17.733887733887734, "grad_norm": 0.3854127526283264, "learning_rate": 1.9308806278271143e-06, "loss": 0.0929, "num_input_tokens_seen": 6504520, "step": 34120 }, { "epoch": 17.736486486486488, "grad_norm": 0.17988336086273193, "learning_rate": 1.9265132860377872e-06, "loss": 0.0731, "num_input_tokens_seen": 6505512, "step": 34125 }, { "epoch": 17.739085239085238, "grad_norm": 0.25407540798187256, "learning_rate": 1.922150691014518e-06, "loss": 0.1284, "num_input_tokens_seen": 6506472, "step": 34130 }, { "epoch": 17.741683991683992, "grad_norm": 0.670322060585022, "learning_rate": 1.9177928436547946e-06, "loss": 0.1393, "num_input_tokens_seen": 6507400, "step": 34135 }, { "epoch": 17.744282744282746, "grad_norm": 0.057761501520872116, "learning_rate": 1.9134397448551305e-06, "loss": 0.1167, "num_input_tokens_seen": 6508232, "step": 34140 }, { "epoch": 17.746881496881496, "grad_norm": 0.24735096096992493, "learning_rate": 1.9090913955110644e-06, "loss": 0.0848, "num_input_tokens_seen": 6509192, "step": 34145 }, { "epoch": 17.74948024948025, "grad_norm": 0.3523409366607666, "learning_rate": 1.9047477965171618e-06, "loss": 0.1169, "num_input_tokens_seen": 6510120, "step": 34150 }, { "epoch": 17.752079002079, "grad_norm": 0.14367076754570007, "learning_rate": 1.9004089487669853e-06, "loss": 0.1444, "num_input_tokens_seen": 6511048, "step": 34155 }, { "epoch": 17.754677754677754, "grad_norm": 0.3720835745334625, "learning_rate": 1.8960748531531542e-06, "loss": 0.0922, "num_input_tokens_seen": 6512040, "step": 34160 }, { "epoch": 17.757276507276508, "grad_norm": 2.85319447517395, "learning_rate": 1.8917455105672943e-06, "loss": 0.2737, "num_input_tokens_seen": 6513032, "step": 34165 }, { "epoch": 17.75987525987526, "grad_norm": 0.9740196466445923, "learning_rate": 1.8874209219000488e-06, "loss": 0.1289, "num_input_tokens_seen": 6513928, "step": 34170 }, { "epoch": 17.762474012474012, "grad_norm": 0.4378223717212677, "learning_rate": 1.8831010880410893e-06, "loss": 0.1116, "num_input_tokens_seen": 6514920, "step": 34175 }, { "epoch": 17.765072765072766, "grad_norm": 0.6381381154060364, "learning_rate": 1.878786009879116e-06, "loss": 0.0794, "num_input_tokens_seen": 6515816, "step": 34180 }, { "epoch": 17.767671517671516, "grad_norm": 1.8366308212280273, "learning_rate": 1.8744756883018354e-06, "loss": 0.1542, "num_input_tokens_seen": 6516840, "step": 34185 }, { "epoch": 17.77027027027027, "grad_norm": 1.23501455783844, "learning_rate": 1.8701701241959885e-06, "loss": 0.1549, "num_input_tokens_seen": 6517800, "step": 34190 }, { "epoch": 17.772869022869024, "grad_norm": 0.9064979553222656, "learning_rate": 1.8658693184473275e-06, "loss": 0.1156, "num_input_tokens_seen": 6518728, "step": 34195 }, { "epoch": 17.775467775467774, "grad_norm": 0.35122746229171753, "learning_rate": 1.8615732719406337e-06, "loss": 0.1117, "num_input_tokens_seen": 6519688, "step": 34200 }, { "epoch": 17.778066528066528, "grad_norm": 0.23381973803043365, "learning_rate": 1.8572819855597085e-06, "loss": 0.1157, "num_input_tokens_seen": 6520616, "step": 34205 }, { "epoch": 17.780665280665282, "grad_norm": 0.4320047199726105, "learning_rate": 1.852995460187365e-06, "loss": 0.1451, "num_input_tokens_seen": 6521640, "step": 34210 }, { "epoch": 17.783264033264032, "grad_norm": 0.7037123441696167, "learning_rate": 1.8487136967054564e-06, "loss": 0.1224, "num_input_tokens_seen": 6522536, "step": 34215 }, { "epoch": 17.785862785862786, "grad_norm": 0.1562584936618805, "learning_rate": 1.8444366959948278e-06, "loss": 0.0898, "num_input_tokens_seen": 6523464, "step": 34220 }, { "epoch": 17.78846153846154, "grad_norm": 0.14189651608467102, "learning_rate": 1.8401644589353673e-06, "loss": 0.109, "num_input_tokens_seen": 6524392, "step": 34225 }, { "epoch": 17.79106029106029, "grad_norm": 0.6415231227874756, "learning_rate": 1.8358969864059771e-06, "loss": 0.1149, "num_input_tokens_seen": 6525352, "step": 34230 }, { "epoch": 17.793659043659044, "grad_norm": 0.10532578080892563, "learning_rate": 1.8316342792845776e-06, "loss": 0.1256, "num_input_tokens_seen": 6526216, "step": 34235 }, { "epoch": 17.796257796257795, "grad_norm": 0.9010109305381775, "learning_rate": 1.8273763384481086e-06, "loss": 0.1345, "num_input_tokens_seen": 6527176, "step": 34240 }, { "epoch": 17.79885654885655, "grad_norm": 0.20629975199699402, "learning_rate": 1.8231231647725221e-06, "loss": 0.1008, "num_input_tokens_seen": 6528136, "step": 34245 }, { "epoch": 17.801455301455302, "grad_norm": 0.5803614258766174, "learning_rate": 1.8188747591328132e-06, "loss": 0.1185, "num_input_tokens_seen": 6529128, "step": 34250 }, { "epoch": 17.804054054054053, "grad_norm": 1.169053554534912, "learning_rate": 1.814631122402971e-06, "loss": 0.0573, "num_input_tokens_seen": 6530056, "step": 34255 }, { "epoch": 17.806652806652806, "grad_norm": 0.6207475662231445, "learning_rate": 1.8103922554560114e-06, "loss": 0.0951, "num_input_tokens_seen": 6530984, "step": 34260 }, { "epoch": 17.80925155925156, "grad_norm": 0.5075840950012207, "learning_rate": 1.8061581591639705e-06, "loss": 0.0446, "num_input_tokens_seen": 6531976, "step": 34265 }, { "epoch": 17.81185031185031, "grad_norm": 0.2172360122203827, "learning_rate": 1.801928834397909e-06, "loss": 0.0821, "num_input_tokens_seen": 6532968, "step": 34270 }, { "epoch": 17.814449064449065, "grad_norm": 0.727897584438324, "learning_rate": 1.7977042820278956e-06, "loss": 0.1564, "num_input_tokens_seen": 6533928, "step": 34275 }, { "epoch": 17.81704781704782, "grad_norm": 1.3095879554748535, "learning_rate": 1.7934845029230207e-06, "loss": 0.1296, "num_input_tokens_seen": 6534920, "step": 34280 }, { "epoch": 17.81964656964657, "grad_norm": 1.8854718208312988, "learning_rate": 1.789269497951393e-06, "loss": 0.2616, "num_input_tokens_seen": 6535912, "step": 34285 }, { "epoch": 17.822245322245323, "grad_norm": 0.6702097058296204, "learning_rate": 1.7850592679801438e-06, "loss": 0.2627, "num_input_tokens_seen": 6536904, "step": 34290 }, { "epoch": 17.824844074844076, "grad_norm": 0.23227354884147644, "learning_rate": 1.7808538138754132e-06, "loss": 0.0938, "num_input_tokens_seen": 6537864, "step": 34295 }, { "epoch": 17.827442827442827, "grad_norm": 0.11635325849056244, "learning_rate": 1.7766531365023597e-06, "loss": 0.0564, "num_input_tokens_seen": 6538888, "step": 34300 }, { "epoch": 17.83004158004158, "grad_norm": 0.0877356007695198, "learning_rate": 1.7724572367251723e-06, "loss": 0.0919, "num_input_tokens_seen": 6539752, "step": 34305 }, { "epoch": 17.83264033264033, "grad_norm": 0.21925441920757294, "learning_rate": 1.7682661154070411e-06, "loss": 0.0942, "num_input_tokens_seen": 6540744, "step": 34310 }, { "epoch": 17.835239085239085, "grad_norm": 0.8198537230491638, "learning_rate": 1.764079773410185e-06, "loss": 0.1835, "num_input_tokens_seen": 6541640, "step": 34315 }, { "epoch": 17.83783783783784, "grad_norm": 1.0785259008407593, "learning_rate": 1.759898211595823e-06, "loss": 0.19, "num_input_tokens_seen": 6542536, "step": 34320 }, { "epoch": 17.84043659043659, "grad_norm": 0.912510335445404, "learning_rate": 1.755721430824217e-06, "loss": 0.2351, "num_input_tokens_seen": 6543400, "step": 34325 }, { "epoch": 17.843035343035343, "grad_norm": 1.5968729257583618, "learning_rate": 1.7515494319546215e-06, "loss": 0.2396, "num_input_tokens_seen": 6544360, "step": 34330 }, { "epoch": 17.845634095634097, "grad_norm": 0.280783474445343, "learning_rate": 1.747382215845314e-06, "loss": 0.0691, "num_input_tokens_seen": 6545352, "step": 34335 }, { "epoch": 17.848232848232847, "grad_norm": 0.7937212586402893, "learning_rate": 1.7432197833535975e-06, "loss": 0.2042, "num_input_tokens_seen": 6546280, "step": 34340 }, { "epoch": 17.8508316008316, "grad_norm": 1.0880314111709595, "learning_rate": 1.739062135335784e-06, "loss": 0.1465, "num_input_tokens_seen": 6547208, "step": 34345 }, { "epoch": 17.853430353430355, "grad_norm": 0.48379307985305786, "learning_rate": 1.7349092726471894e-06, "loss": 0.1039, "num_input_tokens_seen": 6548104, "step": 34350 }, { "epoch": 17.856029106029105, "grad_norm": 0.5387684106826782, "learning_rate": 1.7307611961421665e-06, "loss": 0.1177, "num_input_tokens_seen": 6549064, "step": 34355 }, { "epoch": 17.85862785862786, "grad_norm": 0.4667969048023224, "learning_rate": 1.7266179066740718e-06, "loss": 0.0634, "num_input_tokens_seen": 6549992, "step": 34360 }, { "epoch": 17.861226611226613, "grad_norm": 0.43383142352104187, "learning_rate": 1.7224794050952787e-06, "loss": 0.1812, "num_input_tokens_seen": 6550920, "step": 34365 }, { "epoch": 17.863825363825363, "grad_norm": 0.23556052148342133, "learning_rate": 1.71834569225717e-06, "loss": 0.0905, "num_input_tokens_seen": 6551848, "step": 34370 }, { "epoch": 17.866424116424117, "grad_norm": 1.365269422531128, "learning_rate": 1.7142167690101574e-06, "loss": 0.1308, "num_input_tokens_seen": 6552840, "step": 34375 }, { "epoch": 17.86902286902287, "grad_norm": 1.0478111505508423, "learning_rate": 1.7100926362036529e-06, "loss": 0.1477, "num_input_tokens_seen": 6553800, "step": 34380 }, { "epoch": 17.87162162162162, "grad_norm": 0.291343629360199, "learning_rate": 1.7059732946860945e-06, "loss": 0.1926, "num_input_tokens_seen": 6554824, "step": 34385 }, { "epoch": 17.874220374220375, "grad_norm": 0.7896120548248291, "learning_rate": 1.7018587453049184e-06, "loss": 0.1881, "num_input_tokens_seen": 6555752, "step": 34390 }, { "epoch": 17.876819126819125, "grad_norm": 0.6679056286811829, "learning_rate": 1.6977489889065973e-06, "loss": 0.1135, "num_input_tokens_seen": 6556712, "step": 34395 }, { "epoch": 17.87941787941788, "grad_norm": 0.18475720286369324, "learning_rate": 1.6936440263366022e-06, "loss": 0.1031, "num_input_tokens_seen": 6557640, "step": 34400 }, { "epoch": 17.882016632016633, "grad_norm": 0.31687989830970764, "learning_rate": 1.6895438584394186e-06, "loss": 0.1593, "num_input_tokens_seen": 6558632, "step": 34405 }, { "epoch": 17.884615384615383, "grad_norm": 0.5330077409744263, "learning_rate": 1.6854484860585523e-06, "loss": 0.1056, "num_input_tokens_seen": 6559720, "step": 34410 }, { "epoch": 17.887214137214137, "grad_norm": 0.8101176619529724, "learning_rate": 1.6813579100365185e-06, "loss": 0.2466, "num_input_tokens_seen": 6560712, "step": 34415 }, { "epoch": 17.88981288981289, "grad_norm": 0.6385747790336609, "learning_rate": 1.6772721312148438e-06, "loss": 0.146, "num_input_tokens_seen": 6561672, "step": 34420 }, { "epoch": 17.89241164241164, "grad_norm": 0.7860945463180542, "learning_rate": 1.673191150434067e-06, "loss": 0.1839, "num_input_tokens_seen": 6562600, "step": 34425 }, { "epoch": 17.895010395010395, "grad_norm": 1.1085189580917358, "learning_rate": 1.6691149685337526e-06, "loss": 0.1684, "num_input_tokens_seen": 6563464, "step": 34430 }, { "epoch": 17.89760914760915, "grad_norm": 0.3417005240917206, "learning_rate": 1.6650435863524632e-06, "loss": 0.1044, "num_input_tokens_seen": 6564392, "step": 34435 }, { "epoch": 17.9002079002079, "grad_norm": 0.6241641044616699, "learning_rate": 1.660977004727779e-06, "loss": 0.0766, "num_input_tokens_seen": 6565320, "step": 34440 }, { "epoch": 17.902806652806653, "grad_norm": 0.19986259937286377, "learning_rate": 1.6569152244962888e-06, "loss": 0.1422, "num_input_tokens_seen": 6566312, "step": 34445 }, { "epoch": 17.905405405405407, "grad_norm": 0.2827025055885315, "learning_rate": 1.6528582464936076e-06, "loss": 0.0345, "num_input_tokens_seen": 6567208, "step": 34450 }, { "epoch": 17.908004158004157, "grad_norm": 0.17840351164340973, "learning_rate": 1.6488060715543457e-06, "loss": 0.1358, "num_input_tokens_seen": 6568168, "step": 34455 }, { "epoch": 17.91060291060291, "grad_norm": 0.07653132826089859, "learning_rate": 1.6447587005121307e-06, "loss": 0.0801, "num_input_tokens_seen": 6569064, "step": 34460 }, { "epoch": 17.91320166320166, "grad_norm": 0.9120785593986511, "learning_rate": 1.6407161341996074e-06, "loss": 0.0701, "num_input_tokens_seen": 6570088, "step": 34465 }, { "epoch": 17.915800415800415, "grad_norm": 0.355011522769928, "learning_rate": 1.6366783734484276e-06, "loss": 0.0948, "num_input_tokens_seen": 6570952, "step": 34470 }, { "epoch": 17.91839916839917, "grad_norm": 0.7861403822898865, "learning_rate": 1.6326454190892516e-06, "loss": 0.0987, "num_input_tokens_seen": 6571848, "step": 34475 }, { "epoch": 17.92099792099792, "grad_norm": 1.4188472032546997, "learning_rate": 1.6286172719517601e-06, "loss": 0.2215, "num_input_tokens_seen": 6572808, "step": 34480 }, { "epoch": 17.923596673596673, "grad_norm": 0.6062750816345215, "learning_rate": 1.624593932864632e-06, "loss": 0.0714, "num_input_tokens_seen": 6573736, "step": 34485 }, { "epoch": 17.926195426195427, "grad_norm": 1.5900384187698364, "learning_rate": 1.6205754026555664e-06, "loss": 0.155, "num_input_tokens_seen": 6574600, "step": 34490 }, { "epoch": 17.928794178794178, "grad_norm": 0.13109035789966583, "learning_rate": 1.6165616821512714e-06, "loss": 0.0505, "num_input_tokens_seen": 6575560, "step": 34495 }, { "epoch": 17.93139293139293, "grad_norm": 0.8709766864776611, "learning_rate": 1.612552772177467e-06, "loss": 0.128, "num_input_tokens_seen": 6576552, "step": 34500 }, { "epoch": 17.933991683991685, "grad_norm": 0.06942490488290787, "learning_rate": 1.6085486735588768e-06, "loss": 0.0544, "num_input_tokens_seen": 6577512, "step": 34505 }, { "epoch": 17.936590436590436, "grad_norm": 0.8217112421989441, "learning_rate": 1.6045493871192446e-06, "loss": 0.1165, "num_input_tokens_seen": 6578504, "step": 34510 }, { "epoch": 17.93918918918919, "grad_norm": 0.044331006705760956, "learning_rate": 1.6005549136813124e-06, "loss": 0.1762, "num_input_tokens_seen": 6579464, "step": 34515 }, { "epoch": 17.941787941787943, "grad_norm": 0.35137611627578735, "learning_rate": 1.5965652540668479e-06, "loss": 0.0637, "num_input_tokens_seen": 6580424, "step": 34520 }, { "epoch": 17.944386694386694, "grad_norm": 0.5339072346687317, "learning_rate": 1.5925804090966113e-06, "loss": 0.2097, "num_input_tokens_seen": 6581416, "step": 34525 }, { "epoch": 17.946985446985448, "grad_norm": 0.46081843972206116, "learning_rate": 1.5886003795903853e-06, "loss": 0.107, "num_input_tokens_seen": 6582440, "step": 34530 }, { "epoch": 17.9495841995842, "grad_norm": 0.8340989351272583, "learning_rate": 1.5846251663669487e-06, "loss": 0.1273, "num_input_tokens_seen": 6583464, "step": 34535 }, { "epoch": 17.95218295218295, "grad_norm": 0.13744446635246277, "learning_rate": 1.5806547702441083e-06, "loss": 0.0943, "num_input_tokens_seen": 6584328, "step": 34540 }, { "epoch": 17.954781704781706, "grad_norm": 0.7421625852584839, "learning_rate": 1.5766891920386633e-06, "loss": 0.0972, "num_input_tokens_seen": 6585256, "step": 34545 }, { "epoch": 17.957380457380456, "grad_norm": 1.1394755840301514, "learning_rate": 1.5727284325664253e-06, "loss": 0.1851, "num_input_tokens_seen": 6586280, "step": 34550 }, { "epoch": 17.95997920997921, "grad_norm": 1.225983738899231, "learning_rate": 1.5687724926422205e-06, "loss": 0.0812, "num_input_tokens_seen": 6587272, "step": 34555 }, { "epoch": 17.962577962577964, "grad_norm": 0.1878734976053238, "learning_rate": 1.5648213730798805e-06, "loss": 0.0707, "num_input_tokens_seen": 6588232, "step": 34560 }, { "epoch": 17.965176715176714, "grad_norm": 0.2929338216781616, "learning_rate": 1.5608750746922446e-06, "loss": 0.0815, "num_input_tokens_seen": 6589224, "step": 34565 }, { "epoch": 17.967775467775468, "grad_norm": 0.6544225215911865, "learning_rate": 1.5569335982911549e-06, "loss": 0.095, "num_input_tokens_seen": 6590216, "step": 34570 }, { "epoch": 17.97037422037422, "grad_norm": 0.9537798762321472, "learning_rate": 1.5529969446874764e-06, "loss": 0.1634, "num_input_tokens_seen": 6591112, "step": 34575 }, { "epoch": 17.972972972972972, "grad_norm": 0.13187973201274872, "learning_rate": 1.5490651146910673e-06, "loss": 0.0602, "num_input_tokens_seen": 6592104, "step": 34580 }, { "epoch": 17.975571725571726, "grad_norm": 0.5612656474113464, "learning_rate": 1.5451381091107941e-06, "loss": 0.1213, "num_input_tokens_seen": 6593096, "step": 34585 }, { "epoch": 17.97817047817048, "grad_norm": 0.292344331741333, "learning_rate": 1.5412159287545468e-06, "loss": 0.1078, "num_input_tokens_seen": 6594120, "step": 34590 }, { "epoch": 17.98076923076923, "grad_norm": 0.6696272492408752, "learning_rate": 1.5372985744292024e-06, "loss": 0.1878, "num_input_tokens_seen": 6595048, "step": 34595 }, { "epoch": 17.983367983367984, "grad_norm": 0.22616468369960785, "learning_rate": 1.5333860469406574e-06, "loss": 0.1169, "num_input_tokens_seen": 6595912, "step": 34600 }, { "epoch": 17.985966735966738, "grad_norm": 0.4195953607559204, "learning_rate": 1.529478347093813e-06, "loss": 0.0903, "num_input_tokens_seen": 6596808, "step": 34605 }, { "epoch": 17.988565488565488, "grad_norm": 0.5190683603286743, "learning_rate": 1.5255754756925728e-06, "loss": 0.1952, "num_input_tokens_seen": 6597800, "step": 34610 }, { "epoch": 17.991164241164242, "grad_norm": 0.27629077434539795, "learning_rate": 1.5216774335398532e-06, "loss": 0.0934, "num_input_tokens_seen": 6598792, "step": 34615 }, { "epoch": 17.993762993762992, "grad_norm": 0.11086075752973557, "learning_rate": 1.517784221437568e-06, "loss": 0.1356, "num_input_tokens_seen": 6599752, "step": 34620 }, { "epoch": 17.996361746361746, "grad_norm": 1.1345655918121338, "learning_rate": 1.513895840186655e-06, "loss": 0.0888, "num_input_tokens_seen": 6600744, "step": 34625 }, { "epoch": 17.9989604989605, "grad_norm": 1.1382807493209839, "learning_rate": 1.5100122905870373e-06, "loss": 0.109, "num_input_tokens_seen": 6601608, "step": 34630 }, { "epoch": 18.0, "eval_loss": 0.14623308181762695, "eval_runtime": 8.0255, "eval_samples_per_second": 106.661, "eval_steps_per_second": 26.665, "num_input_tokens_seen": 6601944, "step": 34632 }, { "epoch": 18.00155925155925, "grad_norm": 0.25825533270835876, "learning_rate": 1.5061335734376598e-06, "loss": 0.0397, "num_input_tokens_seen": 6602488, "step": 34635 }, { "epoch": 18.004158004158004, "grad_norm": 0.1877347081899643, "learning_rate": 1.5022596895364615e-06, "loss": 0.0782, "num_input_tokens_seen": 6603416, "step": 34640 }, { "epoch": 18.006756756756758, "grad_norm": 1.3123600482940674, "learning_rate": 1.4983906396803972e-06, "loss": 0.173, "num_input_tokens_seen": 6604376, "step": 34645 }, { "epoch": 18.009355509355508, "grad_norm": 0.19066187739372253, "learning_rate": 1.4945264246654212e-06, "loss": 0.1012, "num_input_tokens_seen": 6605368, "step": 34650 }, { "epoch": 18.011954261954262, "grad_norm": 0.11493764072656631, "learning_rate": 1.490667045286495e-06, "loss": 0.0162, "num_input_tokens_seen": 6606296, "step": 34655 }, { "epoch": 18.014553014553016, "grad_norm": 1.205531358718872, "learning_rate": 1.486812502337581e-06, "loss": 0.1872, "num_input_tokens_seen": 6607192, "step": 34660 }, { "epoch": 18.017151767151766, "grad_norm": 0.49601635336875916, "learning_rate": 1.482962796611656e-06, "loss": 0.1185, "num_input_tokens_seen": 6608152, "step": 34665 }, { "epoch": 18.01975051975052, "grad_norm": 1.633614420890808, "learning_rate": 1.4791179289006973e-06, "loss": 0.4231, "num_input_tokens_seen": 6609112, "step": 34670 }, { "epoch": 18.022349272349274, "grad_norm": 0.7627578973770142, "learning_rate": 1.475277899995678e-06, "loss": 0.1364, "num_input_tokens_seen": 6609976, "step": 34675 }, { "epoch": 18.024948024948024, "grad_norm": 0.1846265196800232, "learning_rate": 1.4714427106865909e-06, "loss": 0.083, "num_input_tokens_seen": 6610840, "step": 34680 }, { "epoch": 18.027546777546778, "grad_norm": 0.15871834754943848, "learning_rate": 1.4676123617624215e-06, "loss": 0.1065, "num_input_tokens_seen": 6611736, "step": 34685 }, { "epoch": 18.03014553014553, "grad_norm": 0.15100881457328796, "learning_rate": 1.463786854011165e-06, "loss": 0.0821, "num_input_tokens_seen": 6612664, "step": 34690 }, { "epoch": 18.032744282744282, "grad_norm": 0.08508285135030746, "learning_rate": 1.459966188219819e-06, "loss": 0.0721, "num_input_tokens_seen": 6613560, "step": 34695 }, { "epoch": 18.035343035343036, "grad_norm": 0.08422809839248657, "learning_rate": 1.4561503651743886e-06, "loss": 0.1654, "num_input_tokens_seen": 6614520, "step": 34700 }, { "epoch": 18.037941787941786, "grad_norm": 0.8167506456375122, "learning_rate": 1.452339385659876e-06, "loss": 0.1337, "num_input_tokens_seen": 6615512, "step": 34705 }, { "epoch": 18.04054054054054, "grad_norm": 0.3881911039352417, "learning_rate": 1.4485332504602905e-06, "loss": 0.1031, "num_input_tokens_seen": 6616440, "step": 34710 }, { "epoch": 18.043139293139294, "grad_norm": 0.9702643156051636, "learning_rate": 1.4447319603586473e-06, "loss": 0.1415, "num_input_tokens_seen": 6617336, "step": 34715 }, { "epoch": 18.045738045738045, "grad_norm": 1.0211223363876343, "learning_rate": 1.4409355161369626e-06, "loss": 0.192, "num_input_tokens_seen": 6618264, "step": 34720 }, { "epoch": 18.0483367983368, "grad_norm": 1.8358525037765503, "learning_rate": 1.437143918576253e-06, "loss": 0.1922, "num_input_tokens_seen": 6619224, "step": 34725 }, { "epoch": 18.050935550935552, "grad_norm": 1.038352131843567, "learning_rate": 1.4333571684565395e-06, "loss": 0.1325, "num_input_tokens_seen": 6620152, "step": 34730 }, { "epoch": 18.053534303534303, "grad_norm": 0.5763375759124756, "learning_rate": 1.4295752665568567e-06, "loss": 0.1229, "num_input_tokens_seen": 6621144, "step": 34735 }, { "epoch": 18.056133056133056, "grad_norm": 0.25235602259635925, "learning_rate": 1.4257982136552189e-06, "loss": 0.1157, "num_input_tokens_seen": 6622136, "step": 34740 }, { "epoch": 18.05873180873181, "grad_norm": 1.4264122247695923, "learning_rate": 1.422026010528657e-06, "loss": 0.1025, "num_input_tokens_seen": 6623128, "step": 34745 }, { "epoch": 18.06133056133056, "grad_norm": 0.5600727796554565, "learning_rate": 1.4182586579532114e-06, "loss": 0.1885, "num_input_tokens_seen": 6624056, "step": 34750 }, { "epoch": 18.063929313929314, "grad_norm": 0.4410608112812042, "learning_rate": 1.4144961567039121e-06, "loss": 0.1268, "num_input_tokens_seen": 6625048, "step": 34755 }, { "epoch": 18.066528066528065, "grad_norm": 1.1073616743087769, "learning_rate": 1.4107385075547957e-06, "loss": 0.106, "num_input_tokens_seen": 6626072, "step": 34760 }, { "epoch": 18.06912681912682, "grad_norm": 0.5920337438583374, "learning_rate": 1.4069857112788936e-06, "loss": 0.076, "num_input_tokens_seen": 6627064, "step": 34765 }, { "epoch": 18.071725571725572, "grad_norm": 1.45713472366333, "learning_rate": 1.4032377686482578e-06, "loss": 0.1868, "num_input_tokens_seen": 6627960, "step": 34770 }, { "epoch": 18.074324324324323, "grad_norm": 0.27822259068489075, "learning_rate": 1.3994946804339243e-06, "loss": 0.0718, "num_input_tokens_seen": 6628984, "step": 34775 }, { "epoch": 18.076923076923077, "grad_norm": 0.9520782828330994, "learning_rate": 1.3957564474059326e-06, "loss": 0.1856, "num_input_tokens_seen": 6630008, "step": 34780 }, { "epoch": 18.07952182952183, "grad_norm": 0.3529309928417206, "learning_rate": 1.3920230703333231e-06, "loss": 0.1303, "num_input_tokens_seen": 6630904, "step": 34785 }, { "epoch": 18.08212058212058, "grad_norm": 1.3602559566497803, "learning_rate": 1.388294549984151e-06, "loss": 0.1119, "num_input_tokens_seen": 6631800, "step": 34790 }, { "epoch": 18.084719334719335, "grad_norm": 1.282310128211975, "learning_rate": 1.3845708871254553e-06, "loss": 0.0858, "num_input_tokens_seen": 6632760, "step": 34795 }, { "epoch": 18.08731808731809, "grad_norm": 0.29086047410964966, "learning_rate": 1.3808520825232817e-06, "loss": 0.0958, "num_input_tokens_seen": 6633752, "step": 34800 }, { "epoch": 18.08991683991684, "grad_norm": 0.667778730392456, "learning_rate": 1.3771381369426822e-06, "loss": 0.0425, "num_input_tokens_seen": 6634680, "step": 34805 }, { "epoch": 18.092515592515593, "grad_norm": 0.4529728293418884, "learning_rate": 1.3734290511476955e-06, "loss": 0.123, "num_input_tokens_seen": 6635576, "step": 34810 }, { "epoch": 18.095114345114347, "grad_norm": 1.1537240743637085, "learning_rate": 1.3697248259013779e-06, "loss": 0.2086, "num_input_tokens_seen": 6636504, "step": 34815 }, { "epoch": 18.097713097713097, "grad_norm": 0.4809413254261017, "learning_rate": 1.3660254619657642e-06, "loss": 0.1814, "num_input_tokens_seen": 6637432, "step": 34820 }, { "epoch": 18.10031185031185, "grad_norm": 0.186554953455925, "learning_rate": 1.362330960101918e-06, "loss": 0.0649, "num_input_tokens_seen": 6638424, "step": 34825 }, { "epoch": 18.102910602910605, "grad_norm": 1.422639012336731, "learning_rate": 1.3586413210698756e-06, "loss": 0.1416, "num_input_tokens_seen": 6639352, "step": 34830 }, { "epoch": 18.105509355509355, "grad_norm": 1.2249020338058472, "learning_rate": 1.3549565456286829e-06, "loss": 0.1723, "num_input_tokens_seen": 6640408, "step": 34835 }, { "epoch": 18.10810810810811, "grad_norm": 1.0726529359817505, "learning_rate": 1.3512766345363942e-06, "loss": 0.0907, "num_input_tokens_seen": 6641336, "step": 34840 }, { "epoch": 18.11070686070686, "grad_norm": 0.34746548533439636, "learning_rate": 1.3476015885500487e-06, "loss": 0.0486, "num_input_tokens_seen": 6642232, "step": 34845 }, { "epoch": 18.113305613305613, "grad_norm": 1.0736911296844482, "learning_rate": 1.3439314084256916e-06, "loss": 0.2875, "num_input_tokens_seen": 6643128, "step": 34850 }, { "epoch": 18.115904365904367, "grad_norm": 0.6247662901878357, "learning_rate": 1.340266094918366e-06, "loss": 0.1901, "num_input_tokens_seen": 6643992, "step": 34855 }, { "epoch": 18.118503118503117, "grad_norm": 0.19258545339107513, "learning_rate": 1.3366056487821189e-06, "loss": 0.066, "num_input_tokens_seen": 6644952, "step": 34860 }, { "epoch": 18.12110187110187, "grad_norm": 0.855655312538147, "learning_rate": 1.3329500707699893e-06, "loss": 0.114, "num_input_tokens_seen": 6645912, "step": 34865 }, { "epoch": 18.123700623700625, "grad_norm": 0.10257887840270996, "learning_rate": 1.3292993616340093e-06, "loss": 0.1557, "num_input_tokens_seen": 6646872, "step": 34870 }, { "epoch": 18.126299376299375, "grad_norm": 0.8424166440963745, "learning_rate": 1.3256535221252254e-06, "loss": 0.0404, "num_input_tokens_seen": 6647832, "step": 34875 }, { "epoch": 18.12889812889813, "grad_norm": 0.4483269453048706, "learning_rate": 1.3220125529936734e-06, "loss": 0.0641, "num_input_tokens_seen": 6648792, "step": 34880 }, { "epoch": 18.131496881496883, "grad_norm": 0.9429872035980225, "learning_rate": 1.318376454988382e-06, "loss": 0.1289, "num_input_tokens_seen": 6649816, "step": 34885 }, { "epoch": 18.134095634095633, "grad_norm": 0.8004177808761597, "learning_rate": 1.3147452288573835e-06, "loss": 0.0982, "num_input_tokens_seen": 6650744, "step": 34890 }, { "epoch": 18.136694386694387, "grad_norm": 0.30069699883461, "learning_rate": 1.3111188753477166e-06, "loss": 0.0859, "num_input_tokens_seen": 6651800, "step": 34895 }, { "epoch": 18.13929313929314, "grad_norm": 0.3116265833377838, "learning_rate": 1.3074973952053982e-06, "loss": 0.0641, "num_input_tokens_seen": 6652760, "step": 34900 }, { "epoch": 18.14189189189189, "grad_norm": 0.26513656973838806, "learning_rate": 1.3038807891754601e-06, "loss": 0.0473, "num_input_tokens_seen": 6653688, "step": 34905 }, { "epoch": 18.144490644490645, "grad_norm": 0.5001218318939209, "learning_rate": 1.3002690580019184e-06, "loss": 0.0663, "num_input_tokens_seen": 6654616, "step": 34910 }, { "epoch": 18.147089397089395, "grad_norm": 0.4565914571285248, "learning_rate": 1.2966622024277952e-06, "loss": 0.0397, "num_input_tokens_seen": 6655640, "step": 34915 }, { "epoch": 18.14968814968815, "grad_norm": 3.003896713256836, "learning_rate": 1.293060223195111e-06, "loss": 0.0992, "num_input_tokens_seen": 6656632, "step": 34920 }, { "epoch": 18.152286902286903, "grad_norm": 0.04951610043644905, "learning_rate": 1.2894631210448733e-06, "loss": 0.1294, "num_input_tokens_seen": 6657528, "step": 34925 }, { "epoch": 18.154885654885653, "grad_norm": 0.6167680025100708, "learning_rate": 1.2858708967170868e-06, "loss": 0.1521, "num_input_tokens_seen": 6658520, "step": 34930 }, { "epoch": 18.157484407484407, "grad_norm": 1.0077245235443115, "learning_rate": 1.2822835509507746e-06, "loss": 0.1553, "num_input_tokens_seen": 6659512, "step": 34935 }, { "epoch": 18.16008316008316, "grad_norm": 0.13631992042064667, "learning_rate": 1.2787010844839214e-06, "loss": 0.0949, "num_input_tokens_seen": 6660536, "step": 34940 }, { "epoch": 18.16268191268191, "grad_norm": 0.8758823275566101, "learning_rate": 1.275123498053532e-06, "loss": 0.1744, "num_input_tokens_seen": 6661528, "step": 34945 }, { "epoch": 18.165280665280665, "grad_norm": 0.32457399368286133, "learning_rate": 1.2715507923956038e-06, "loss": 0.0606, "num_input_tokens_seen": 6662488, "step": 34950 }, { "epoch": 18.16787941787942, "grad_norm": 1.0472397804260254, "learning_rate": 1.2679829682451295e-06, "loss": 0.2305, "num_input_tokens_seen": 6663384, "step": 34955 }, { "epoch": 18.17047817047817, "grad_norm": 0.44091957807540894, "learning_rate": 1.2644200263360857e-06, "loss": 0.1189, "num_input_tokens_seen": 6664344, "step": 34960 }, { "epoch": 18.173076923076923, "grad_norm": 0.857327401638031, "learning_rate": 1.2608619674014638e-06, "loss": 0.0959, "num_input_tokens_seen": 6665304, "step": 34965 }, { "epoch": 18.175675675675677, "grad_norm": 0.03538886457681656, "learning_rate": 1.2573087921732368e-06, "loss": 0.076, "num_input_tokens_seen": 6666296, "step": 34970 }, { "epoch": 18.178274428274428, "grad_norm": 1.9977295398712158, "learning_rate": 1.253760501382381e-06, "loss": 0.0858, "num_input_tokens_seen": 6667224, "step": 34975 }, { "epoch": 18.18087318087318, "grad_norm": 0.6109523177146912, "learning_rate": 1.25021709575886e-06, "loss": 0.1056, "num_input_tokens_seen": 6668184, "step": 34980 }, { "epoch": 18.183471933471935, "grad_norm": 0.04118135944008827, "learning_rate": 1.2466785760316401e-06, "loss": 0.0825, "num_input_tokens_seen": 6669144, "step": 34985 }, { "epoch": 18.186070686070686, "grad_norm": 0.6677102446556091, "learning_rate": 1.2431449429286784e-06, "loss": 0.1062, "num_input_tokens_seen": 6670168, "step": 34990 }, { "epoch": 18.18866943866944, "grad_norm": 0.49621105194091797, "learning_rate": 1.2396161971769292e-06, "loss": 0.1176, "num_input_tokens_seen": 6671160, "step": 34995 }, { "epoch": 18.19126819126819, "grad_norm": 1.0053914785385132, "learning_rate": 1.2360923395023343e-06, "loss": 0.1404, "num_input_tokens_seen": 6672120, "step": 35000 }, { "epoch": 18.193866943866944, "grad_norm": 1.4955271482467651, "learning_rate": 1.2325733706298414e-06, "loss": 0.3072, "num_input_tokens_seen": 6673080, "step": 35005 }, { "epoch": 18.196465696465697, "grad_norm": 1.209835410118103, "learning_rate": 1.2290592912833798e-06, "loss": 0.2858, "num_input_tokens_seen": 6674008, "step": 35010 }, { "epoch": 18.199064449064448, "grad_norm": 1.106561303138733, "learning_rate": 1.2255501021858822e-06, "loss": 0.2215, "num_input_tokens_seen": 6674968, "step": 35015 }, { "epoch": 18.2016632016632, "grad_norm": 0.6819608211517334, "learning_rate": 1.2220458040592769e-06, "loss": 0.1678, "num_input_tokens_seen": 6675896, "step": 35020 }, { "epoch": 18.204261954261955, "grad_norm": 0.1928943693637848, "learning_rate": 1.218546397624476e-06, "loss": 0.1537, "num_input_tokens_seen": 6676856, "step": 35025 }, { "epoch": 18.206860706860706, "grad_norm": 0.1566474288702011, "learning_rate": 1.2150518836013924e-06, "loss": 0.2026, "num_input_tokens_seen": 6677720, "step": 35030 }, { "epoch": 18.20945945945946, "grad_norm": 0.8990377187728882, "learning_rate": 1.2115622627089262e-06, "loss": 0.3126, "num_input_tokens_seen": 6678680, "step": 35035 }, { "epoch": 18.212058212058214, "grad_norm": 0.6577978730201721, "learning_rate": 1.2080775356649866e-06, "loss": 0.1744, "num_input_tokens_seen": 6679640, "step": 35040 }, { "epoch": 18.214656964656964, "grad_norm": 0.6526820659637451, "learning_rate": 1.2045977031864581e-06, "loss": 0.139, "num_input_tokens_seen": 6680568, "step": 35045 }, { "epoch": 18.217255717255718, "grad_norm": 0.8893422484397888, "learning_rate": 1.2011227659892239e-06, "loss": 0.0935, "num_input_tokens_seen": 6681432, "step": 35050 }, { "epoch": 18.21985446985447, "grad_norm": 0.7795884013175964, "learning_rate": 1.1976527247881618e-06, "loss": 0.1747, "num_input_tokens_seen": 6682328, "step": 35055 }, { "epoch": 18.222453222453222, "grad_norm": 0.9546017646789551, "learning_rate": 1.194187580297146e-06, "loss": 0.053, "num_input_tokens_seen": 6683192, "step": 35060 }, { "epoch": 18.225051975051976, "grad_norm": 0.7320932745933533, "learning_rate": 1.1907273332290413e-06, "loss": 0.1269, "num_input_tokens_seen": 6684152, "step": 35065 }, { "epoch": 18.227650727650726, "grad_norm": 0.07050728052854538, "learning_rate": 1.1872719842956904e-06, "loss": 0.1277, "num_input_tokens_seen": 6685048, "step": 35070 }, { "epoch": 18.23024948024948, "grad_norm": 0.5820282101631165, "learning_rate": 1.1838215342079516e-06, "loss": 0.2263, "num_input_tokens_seen": 6686008, "step": 35075 }, { "epoch": 18.232848232848234, "grad_norm": 0.27113479375839233, "learning_rate": 1.1803759836756633e-06, "loss": 0.0564, "num_input_tokens_seen": 6686936, "step": 35080 }, { "epoch": 18.235446985446984, "grad_norm": 0.12757709622383118, "learning_rate": 1.1769353334076527e-06, "loss": 0.0649, "num_input_tokens_seen": 6687928, "step": 35085 }, { "epoch": 18.238045738045738, "grad_norm": 0.5822378396987915, "learning_rate": 1.1734995841117508e-06, "loss": 0.2217, "num_input_tokens_seen": 6688888, "step": 35090 }, { "epoch": 18.240644490644492, "grad_norm": 0.6717614531517029, "learning_rate": 1.170068736494767e-06, "loss": 0.058, "num_input_tokens_seen": 6689880, "step": 35095 }, { "epoch": 18.243243243243242, "grad_norm": 0.634887158870697, "learning_rate": 1.1666427912625121e-06, "loss": 0.069, "num_input_tokens_seen": 6690808, "step": 35100 }, { "epoch": 18.245841995841996, "grad_norm": 0.14439789950847626, "learning_rate": 1.1632217491197806e-06, "loss": 0.1601, "num_input_tokens_seen": 6691736, "step": 35105 }, { "epoch": 18.24844074844075, "grad_norm": 1.6732840538024902, "learning_rate": 1.1598056107703675e-06, "loss": 0.22, "num_input_tokens_seen": 6692728, "step": 35110 }, { "epoch": 18.2510395010395, "grad_norm": 0.3489816188812256, "learning_rate": 1.1563943769170527e-06, "loss": 0.1626, "num_input_tokens_seen": 6693688, "step": 35115 }, { "epoch": 18.253638253638254, "grad_norm": 0.8375976085662842, "learning_rate": 1.1529880482616051e-06, "loss": 0.1977, "num_input_tokens_seen": 6694584, "step": 35120 }, { "epoch": 18.256237006237008, "grad_norm": 0.23351922631263733, "learning_rate": 1.1495866255047866e-06, "loss": 0.1621, "num_input_tokens_seen": 6695576, "step": 35125 }, { "epoch": 18.258835758835758, "grad_norm": 0.15016597509384155, "learning_rate": 1.1461901093463595e-06, "loss": 0.1211, "num_input_tokens_seen": 6696504, "step": 35130 }, { "epoch": 18.261434511434512, "grad_norm": 0.2601441740989685, "learning_rate": 1.1427985004850623e-06, "loss": 0.1278, "num_input_tokens_seen": 6697496, "step": 35135 }, { "epoch": 18.264033264033262, "grad_norm": 0.38249361515045166, "learning_rate": 1.1394117996186226e-06, "loss": 0.0934, "num_input_tokens_seen": 6698552, "step": 35140 }, { "epoch": 18.266632016632016, "grad_norm": 0.23117764294147491, "learning_rate": 1.1360300074437808e-06, "loss": 0.1021, "num_input_tokens_seen": 6699448, "step": 35145 }, { "epoch": 18.26923076923077, "grad_norm": 1.0787261724472046, "learning_rate": 1.132653124656241e-06, "loss": 0.1761, "num_input_tokens_seen": 6700376, "step": 35150 }, { "epoch": 18.27182952182952, "grad_norm": 0.33491989970207214, "learning_rate": 1.1292811519507117e-06, "loss": 0.1576, "num_input_tokens_seen": 6701272, "step": 35155 }, { "epoch": 18.274428274428274, "grad_norm": 0.4649440050125122, "learning_rate": 1.1259140900208853e-06, "loss": 0.0579, "num_input_tokens_seen": 6702232, "step": 35160 }, { "epoch": 18.277027027027028, "grad_norm": 0.6603987216949463, "learning_rate": 1.122551939559452e-06, "loss": 0.1029, "num_input_tokens_seen": 6703224, "step": 35165 }, { "epoch": 18.27962577962578, "grad_norm": 0.2950710952281952, "learning_rate": 1.119194701258086e-06, "loss": 0.1056, "num_input_tokens_seen": 6704184, "step": 35170 }, { "epoch": 18.282224532224532, "grad_norm": 0.5097563862800598, "learning_rate": 1.1158423758074465e-06, "loss": 0.0537, "num_input_tokens_seen": 6705112, "step": 35175 }, { "epoch": 18.284823284823286, "grad_norm": 0.8787248134613037, "learning_rate": 1.112494963897187e-06, "loss": 0.1486, "num_input_tokens_seen": 6706136, "step": 35180 }, { "epoch": 18.287422037422036, "grad_norm": 0.8395088911056519, "learning_rate": 1.109152466215954e-06, "loss": 0.1552, "num_input_tokens_seen": 6707128, "step": 35185 }, { "epoch": 18.29002079002079, "grad_norm": 0.14686931669712067, "learning_rate": 1.105814883451378e-06, "loss": 0.1324, "num_input_tokens_seen": 6707992, "step": 35190 }, { "epoch": 18.292619542619544, "grad_norm": 1.302754521369934, "learning_rate": 1.1024822162900767e-06, "loss": 0.1184, "num_input_tokens_seen": 6708952, "step": 35195 }, { "epoch": 18.295218295218294, "grad_norm": 0.038145191967487335, "learning_rate": 1.0991544654176623e-06, "loss": 0.0348, "num_input_tokens_seen": 6709944, "step": 35200 }, { "epoch": 18.29781704781705, "grad_norm": 0.591952383518219, "learning_rate": 1.095831631518729e-06, "loss": 0.078, "num_input_tokens_seen": 6710872, "step": 35205 }, { "epoch": 18.3004158004158, "grad_norm": 0.5151130557060242, "learning_rate": 1.0925137152768633e-06, "loss": 0.1355, "num_input_tokens_seen": 6711800, "step": 35210 }, { "epoch": 18.303014553014552, "grad_norm": 0.26166415214538574, "learning_rate": 1.0892007173746417e-06, "loss": 0.1522, "num_input_tokens_seen": 6712792, "step": 35215 }, { "epoch": 18.305613305613306, "grad_norm": 0.6075745224952698, "learning_rate": 1.0858926384936297e-06, "loss": 0.0954, "num_input_tokens_seen": 6713752, "step": 35220 }, { "epoch": 18.308212058212057, "grad_norm": 2.234933853149414, "learning_rate": 1.0825894793143721e-06, "loss": 0.1639, "num_input_tokens_seen": 6714744, "step": 35225 }, { "epoch": 18.31081081081081, "grad_norm": 0.7017157077789307, "learning_rate": 1.0792912405164084e-06, "loss": 0.0982, "num_input_tokens_seen": 6715672, "step": 35230 }, { "epoch": 18.313409563409564, "grad_norm": 0.8288227915763855, "learning_rate": 1.0759979227782658e-06, "loss": 0.1747, "num_input_tokens_seen": 6716600, "step": 35235 }, { "epoch": 18.316008316008315, "grad_norm": 0.7281321883201599, "learning_rate": 1.072709526777463e-06, "loss": 0.1047, "num_input_tokens_seen": 6717560, "step": 35240 }, { "epoch": 18.31860706860707, "grad_norm": 0.6686921715736389, "learning_rate": 1.0694260531904948e-06, "loss": 0.1662, "num_input_tokens_seen": 6718584, "step": 35245 }, { "epoch": 18.321205821205822, "grad_norm": 1.407353162765503, "learning_rate": 1.0661475026928492e-06, "loss": 0.0789, "num_input_tokens_seen": 6719576, "step": 35250 }, { "epoch": 18.323804573804573, "grad_norm": 0.34805870056152344, "learning_rate": 1.0628738759590084e-06, "loss": 0.195, "num_input_tokens_seen": 6720504, "step": 35255 }, { "epoch": 18.326403326403327, "grad_norm": 0.27752843499183655, "learning_rate": 1.0596051736624367e-06, "loss": 0.2091, "num_input_tokens_seen": 6721528, "step": 35260 }, { "epoch": 18.32900207900208, "grad_norm": 0.6933026313781738, "learning_rate": 1.0563413964755742e-06, "loss": 0.1224, "num_input_tokens_seen": 6722520, "step": 35265 }, { "epoch": 18.33160083160083, "grad_norm": 0.12225916981697083, "learning_rate": 1.053082545069864e-06, "loss": 0.1774, "num_input_tokens_seen": 6723384, "step": 35270 }, { "epoch": 18.334199584199585, "grad_norm": 0.09200182557106018, "learning_rate": 1.0498286201157281e-06, "loss": 0.117, "num_input_tokens_seen": 6724248, "step": 35275 }, { "epoch": 18.33679833679834, "grad_norm": 0.23763109743595123, "learning_rate": 1.0465796222825785e-06, "loss": 0.2396, "num_input_tokens_seen": 6725208, "step": 35280 }, { "epoch": 18.33939708939709, "grad_norm": 1.0306880474090576, "learning_rate": 1.0433355522388083e-06, "loss": 0.2517, "num_input_tokens_seen": 6726232, "step": 35285 }, { "epoch": 18.341995841995843, "grad_norm": 1.1129087209701538, "learning_rate": 1.040096410651803e-06, "loss": 0.0882, "num_input_tokens_seen": 6727288, "step": 35290 }, { "epoch": 18.344594594594593, "grad_norm": 0.8319694399833679, "learning_rate": 1.0368621981879296e-06, "loss": 0.1484, "num_input_tokens_seen": 6728216, "step": 35295 }, { "epoch": 18.347193347193347, "grad_norm": 0.8518783450126648, "learning_rate": 1.0336329155125418e-06, "loss": 0.1742, "num_input_tokens_seen": 6729176, "step": 35300 }, { "epoch": 18.3497920997921, "grad_norm": 0.6063458323478699, "learning_rate": 1.0304085632899779e-06, "loss": 0.1338, "num_input_tokens_seen": 6730136, "step": 35305 }, { "epoch": 18.35239085239085, "grad_norm": 0.9340512156486511, "learning_rate": 1.027189142183574e-06, "loss": 0.291, "num_input_tokens_seen": 6731096, "step": 35310 }, { "epoch": 18.354989604989605, "grad_norm": 1.1868482828140259, "learning_rate": 1.0239746528556303e-06, "loss": 0.2252, "num_input_tokens_seen": 6732056, "step": 35315 }, { "epoch": 18.35758835758836, "grad_norm": 0.12976250052452087, "learning_rate": 1.0207650959674492e-06, "loss": 0.1381, "num_input_tokens_seen": 6733080, "step": 35320 }, { "epoch": 18.36018711018711, "grad_norm": 0.16990217566490173, "learning_rate": 1.0175604721793158e-06, "loss": 0.1287, "num_input_tokens_seen": 6733944, "step": 35325 }, { "epoch": 18.362785862785863, "grad_norm": 0.2693486511707306, "learning_rate": 1.014360782150492e-06, "loss": 0.0873, "num_input_tokens_seen": 6734840, "step": 35330 }, { "epoch": 18.365384615384617, "grad_norm": 0.2128603607416153, "learning_rate": 1.0111660265392287e-06, "loss": 0.0695, "num_input_tokens_seen": 6735768, "step": 35335 }, { "epoch": 18.367983367983367, "grad_norm": 0.7057995796203613, "learning_rate": 1.0079762060027698e-06, "loss": 0.0786, "num_input_tokens_seen": 6736696, "step": 35340 }, { "epoch": 18.37058212058212, "grad_norm": 0.8628451228141785, "learning_rate": 1.0047913211973376e-06, "loss": 0.1621, "num_input_tokens_seen": 6737592, "step": 35345 }, { "epoch": 18.373180873180875, "grad_norm": 0.1907966136932373, "learning_rate": 1.0016113727781328e-06, "loss": 0.0567, "num_input_tokens_seen": 6738648, "step": 35350 }, { "epoch": 18.375779625779625, "grad_norm": 0.3090057969093323, "learning_rate": 9.984363613993463e-07, "loss": 0.1294, "num_input_tokens_seen": 6739672, "step": 35355 }, { "epoch": 18.37837837837838, "grad_norm": 0.18182703852653503, "learning_rate": 9.95266287714161e-07, "loss": 0.0834, "num_input_tokens_seen": 6740664, "step": 35360 }, { "epoch": 18.38097713097713, "grad_norm": 0.13908091187477112, "learning_rate": 9.92101152374733e-07, "loss": 0.1138, "num_input_tokens_seen": 6741560, "step": 35365 }, { "epoch": 18.383575883575883, "grad_norm": 0.08331669121980667, "learning_rate": 9.889409560322028e-07, "loss": 0.1275, "num_input_tokens_seen": 6742552, "step": 35370 }, { "epoch": 18.386174636174637, "grad_norm": 0.19350610673427582, "learning_rate": 9.857856993366999e-07, "loss": 0.0316, "num_input_tokens_seen": 6743480, "step": 35375 }, { "epoch": 18.388773388773387, "grad_norm": 1.2211591005325317, "learning_rate": 9.826353829373387e-07, "loss": 0.1322, "num_input_tokens_seen": 6744408, "step": 35380 }, { "epoch": 18.39137214137214, "grad_norm": 1.2036242485046387, "learning_rate": 9.794900074822143e-07, "loss": 0.2115, "num_input_tokens_seen": 6745368, "step": 35385 }, { "epoch": 18.393970893970895, "grad_norm": 0.9932457804679871, "learning_rate": 9.763495736184014e-07, "loss": 0.2883, "num_input_tokens_seen": 6746264, "step": 35390 }, { "epoch": 18.396569646569645, "grad_norm": 1.6719099283218384, "learning_rate": 9.73214081991966e-07, "loss": 0.222, "num_input_tokens_seen": 6747320, "step": 35395 }, { "epoch": 18.3991683991684, "grad_norm": 0.8600174188613892, "learning_rate": 9.70083533247948e-07, "loss": 0.1149, "num_input_tokens_seen": 6748216, "step": 35400 }, { "epoch": 18.401767151767153, "grad_norm": 0.48961952328681946, "learning_rate": 9.66957928030382e-07, "loss": 0.0902, "num_input_tokens_seen": 6749208, "step": 35405 }, { "epoch": 18.404365904365903, "grad_norm": 0.07843344658613205, "learning_rate": 9.638372669822733e-07, "loss": 0.083, "num_input_tokens_seen": 6750136, "step": 35410 }, { "epoch": 18.406964656964657, "grad_norm": 1.6976927518844604, "learning_rate": 9.607215507456247e-07, "loss": 0.1334, "num_input_tokens_seen": 6751064, "step": 35415 }, { "epoch": 18.40956340956341, "grad_norm": 0.7968128323554993, "learning_rate": 9.576107799614042e-07, "loss": 0.1866, "num_input_tokens_seen": 6751992, "step": 35420 }, { "epoch": 18.41216216216216, "grad_norm": 1.2301188707351685, "learning_rate": 9.545049552695805e-07, "loss": 0.2215, "num_input_tokens_seen": 6752952, "step": 35425 }, { "epoch": 18.414760914760915, "grad_norm": 0.08385614305734634, "learning_rate": 9.514040773090843e-07, "loss": 0.0801, "num_input_tokens_seen": 6753944, "step": 35430 }, { "epoch": 18.41735966735967, "grad_norm": 0.547569215297699, "learning_rate": 9.483081467178523e-07, "loss": 0.1077, "num_input_tokens_seen": 6754904, "step": 35435 }, { "epoch": 18.41995841995842, "grad_norm": 0.14091144502162933, "learning_rate": 9.452171641327834e-07, "loss": 0.0793, "num_input_tokens_seen": 6755832, "step": 35440 }, { "epoch": 18.422557172557173, "grad_norm": 1.079918384552002, "learning_rate": 9.421311301897662e-07, "loss": 0.1457, "num_input_tokens_seen": 6756824, "step": 35445 }, { "epoch": 18.425155925155924, "grad_norm": 0.41494908928871155, "learning_rate": 9.390500455236762e-07, "loss": 0.1325, "num_input_tokens_seen": 6757752, "step": 35450 }, { "epoch": 18.427754677754677, "grad_norm": 0.128944531083107, "learning_rate": 9.359739107683673e-07, "loss": 0.1068, "num_input_tokens_seen": 6758776, "step": 35455 }, { "epoch": 18.43035343035343, "grad_norm": 0.1971791386604309, "learning_rate": 9.329027265566643e-07, "loss": 0.1114, "num_input_tokens_seen": 6759768, "step": 35460 }, { "epoch": 18.43295218295218, "grad_norm": 0.15509219467639923, "learning_rate": 9.298364935203918e-07, "loss": 0.0304, "num_input_tokens_seen": 6760696, "step": 35465 }, { "epoch": 18.435550935550935, "grad_norm": 0.1115587055683136, "learning_rate": 9.267752122903428e-07, "loss": 0.0956, "num_input_tokens_seen": 6761592, "step": 35470 }, { "epoch": 18.43814968814969, "grad_norm": 0.1317831575870514, "learning_rate": 9.237188834962995e-07, "loss": 0.1407, "num_input_tokens_seen": 6762584, "step": 35475 }, { "epoch": 18.44074844074844, "grad_norm": 1.0991601943969727, "learning_rate": 9.206675077670146e-07, "loss": 0.1789, "num_input_tokens_seen": 6763576, "step": 35480 }, { "epoch": 18.443347193347194, "grad_norm": 1.5105153322219849, "learning_rate": 9.176210857302331e-07, "loss": 0.1217, "num_input_tokens_seen": 6764504, "step": 35485 }, { "epoch": 18.445945945945947, "grad_norm": 1.0664535760879517, "learning_rate": 9.145796180126814e-07, "loss": 0.0877, "num_input_tokens_seen": 6765432, "step": 35490 }, { "epoch": 18.448544698544698, "grad_norm": 1.834315299987793, "learning_rate": 9.115431052400536e-07, "loss": 0.2149, "num_input_tokens_seen": 6766360, "step": 35495 }, { "epoch": 18.45114345114345, "grad_norm": 2.9714195728302, "learning_rate": 9.085115480370332e-07, "loss": 0.1786, "num_input_tokens_seen": 6767288, "step": 35500 }, { "epoch": 18.453742203742205, "grad_norm": 0.7541099786758423, "learning_rate": 9.054849470272909e-07, "loss": 0.1112, "num_input_tokens_seen": 6768248, "step": 35505 }, { "epoch": 18.456340956340956, "grad_norm": 0.5168450474739075, "learning_rate": 9.024633028334673e-07, "loss": 0.1077, "num_input_tokens_seen": 6769176, "step": 35510 }, { "epoch": 18.45893970893971, "grad_norm": 0.8205936551094055, "learning_rate": 8.994466160771847e-07, "loss": 0.1414, "num_input_tokens_seen": 6770104, "step": 35515 }, { "epoch": 18.46153846153846, "grad_norm": 1.0663236379623413, "learning_rate": 8.964348873790496e-07, "loss": 0.1661, "num_input_tokens_seen": 6771096, "step": 35520 }, { "epoch": 18.464137214137214, "grad_norm": 0.2592000365257263, "learning_rate": 8.934281173586467e-07, "loss": 0.1094, "num_input_tokens_seen": 6772120, "step": 35525 }, { "epoch": 18.466735966735968, "grad_norm": 0.36918550729751587, "learning_rate": 8.90426306634537e-07, "loss": 0.0814, "num_input_tokens_seen": 6773080, "step": 35530 }, { "epoch": 18.469334719334718, "grad_norm": 0.4832260012626648, "learning_rate": 8.874294558242624e-07, "loss": 0.0557, "num_input_tokens_seen": 6774008, "step": 35535 }, { "epoch": 18.471933471933472, "grad_norm": 0.6961358189582825, "learning_rate": 8.844375655443549e-07, "loss": 0.1897, "num_input_tokens_seen": 6775000, "step": 35540 }, { "epoch": 18.474532224532226, "grad_norm": 0.1415436863899231, "learning_rate": 8.814506364103137e-07, "loss": 0.1654, "num_input_tokens_seen": 6775992, "step": 35545 }, { "epoch": 18.477130977130976, "grad_norm": 0.09430145472288132, "learning_rate": 8.784686690366223e-07, "loss": 0.0944, "num_input_tokens_seen": 6776952, "step": 35550 }, { "epoch": 18.47972972972973, "grad_norm": 0.1676550954580307, "learning_rate": 8.754916640367344e-07, "loss": 0.1247, "num_input_tokens_seen": 6777944, "step": 35555 }, { "epoch": 18.482328482328484, "grad_norm": 1.2668815851211548, "learning_rate": 8.725196220231019e-07, "loss": 0.1119, "num_input_tokens_seen": 6778904, "step": 35560 }, { "epoch": 18.484927234927234, "grad_norm": 0.43187710642814636, "learning_rate": 8.695525436071412e-07, "loss": 0.1252, "num_input_tokens_seen": 6779960, "step": 35565 }, { "epoch": 18.487525987525988, "grad_norm": 0.3947291076183319, "learning_rate": 8.665904293992472e-07, "loss": 0.1892, "num_input_tokens_seen": 6780888, "step": 35570 }, { "epoch": 18.49012474012474, "grad_norm": 0.8794611692428589, "learning_rate": 8.63633280008802e-07, "loss": 0.1668, "num_input_tokens_seen": 6781816, "step": 35575 }, { "epoch": 18.492723492723492, "grad_norm": 0.28875353932380676, "learning_rate": 8.606810960441608e-07, "loss": 0.0851, "num_input_tokens_seen": 6782808, "step": 35580 }, { "epoch": 18.495322245322246, "grad_norm": 0.612082302570343, "learning_rate": 8.57733878112657e-07, "loss": 0.1084, "num_input_tokens_seen": 6783832, "step": 35585 }, { "epoch": 18.497920997920996, "grad_norm": 0.16239456832408905, "learning_rate": 8.547916268206058e-07, "loss": 0.1017, "num_input_tokens_seen": 6784728, "step": 35590 }, { "epoch": 18.50051975051975, "grad_norm": 0.8814169764518738, "learning_rate": 8.51854342773295e-07, "loss": 0.0791, "num_input_tokens_seen": 6785688, "step": 35595 }, { "epoch": 18.503118503118504, "grad_norm": 0.5908525586128235, "learning_rate": 8.489220265749942e-07, "loss": 0.1341, "num_input_tokens_seen": 6786680, "step": 35600 }, { "epoch": 18.505717255717254, "grad_norm": 0.2107383906841278, "learning_rate": 8.459946788289513e-07, "loss": 0.1342, "num_input_tokens_seen": 6787576, "step": 35605 }, { "epoch": 18.508316008316008, "grad_norm": 0.0885596051812172, "learning_rate": 8.43072300137393e-07, "loss": 0.1567, "num_input_tokens_seen": 6788504, "step": 35610 }, { "epoch": 18.510914760914762, "grad_norm": 0.3539046347141266, "learning_rate": 8.401548911015245e-07, "loss": 0.0969, "num_input_tokens_seen": 6789432, "step": 35615 }, { "epoch": 18.513513513513512, "grad_norm": 0.5815295577049255, "learning_rate": 8.372424523215211e-07, "loss": 0.0782, "num_input_tokens_seen": 6790328, "step": 35620 }, { "epoch": 18.516112266112266, "grad_norm": 0.5314905047416687, "learning_rate": 8.343349843965398e-07, "loss": 0.0765, "num_input_tokens_seen": 6791224, "step": 35625 }, { "epoch": 18.51871101871102, "grad_norm": 0.25064384937286377, "learning_rate": 8.314324879247243e-07, "loss": 0.0774, "num_input_tokens_seen": 6792216, "step": 35630 }, { "epoch": 18.52130977130977, "grad_norm": 0.6719711422920227, "learning_rate": 8.285349635031803e-07, "loss": 0.1545, "num_input_tokens_seen": 6793144, "step": 35635 }, { "epoch": 18.523908523908524, "grad_norm": 0.2509406507015228, "learning_rate": 8.256424117280003e-07, "loss": 0.111, "num_input_tokens_seen": 6794072, "step": 35640 }, { "epoch": 18.526507276507278, "grad_norm": 0.7577064037322998, "learning_rate": 8.227548331942475e-07, "loss": 0.1073, "num_input_tokens_seen": 6795000, "step": 35645 }, { "epoch": 18.52910602910603, "grad_norm": 0.3811393082141876, "learning_rate": 8.198722284959742e-07, "loss": 0.1047, "num_input_tokens_seen": 6795928, "step": 35650 }, { "epoch": 18.531704781704782, "grad_norm": 0.1333322823047638, "learning_rate": 8.169945982261923e-07, "loss": 0.072, "num_input_tokens_seen": 6796856, "step": 35655 }, { "epoch": 18.534303534303533, "grad_norm": 1.0832788944244385, "learning_rate": 8.141219429768975e-07, "loss": 0.0942, "num_input_tokens_seen": 6797784, "step": 35660 }, { "epoch": 18.536902286902286, "grad_norm": 0.5452293157577515, "learning_rate": 8.11254263339073e-07, "loss": 0.1594, "num_input_tokens_seen": 6798744, "step": 35665 }, { "epoch": 18.53950103950104, "grad_norm": 0.853092610836029, "learning_rate": 8.083915599026632e-07, "loss": 0.1037, "num_input_tokens_seen": 6799704, "step": 35670 }, { "epoch": 18.54209979209979, "grad_norm": 1.4362159967422485, "learning_rate": 8.055338332565943e-07, "loss": 0.113, "num_input_tokens_seen": 6800664, "step": 35675 }, { "epoch": 18.544698544698544, "grad_norm": 0.3834075927734375, "learning_rate": 8.026810839887655e-07, "loss": 0.1504, "num_input_tokens_seen": 6801656, "step": 35680 }, { "epoch": 18.5472972972973, "grad_norm": 0.3098163902759552, "learning_rate": 7.99833312686063e-07, "loss": 0.0509, "num_input_tokens_seen": 6802616, "step": 35685 }, { "epoch": 18.54989604989605, "grad_norm": 0.05062355101108551, "learning_rate": 7.969905199343374e-07, "loss": 0.0704, "num_input_tokens_seen": 6803608, "step": 35690 }, { "epoch": 18.552494802494802, "grad_norm": 0.22833558917045593, "learning_rate": 7.941527063184157e-07, "loss": 0.1752, "num_input_tokens_seen": 6804440, "step": 35695 }, { "epoch": 18.555093555093556, "grad_norm": 1.8629279136657715, "learning_rate": 7.913198724221111e-07, "loss": 0.171, "num_input_tokens_seen": 6805400, "step": 35700 }, { "epoch": 18.557692307692307, "grad_norm": 0.1828492283821106, "learning_rate": 7.884920188281991e-07, "loss": 0.1036, "num_input_tokens_seen": 6806328, "step": 35705 }, { "epoch": 18.56029106029106, "grad_norm": 0.18058666586875916, "learning_rate": 7.856691461184396e-07, "loss": 0.2041, "num_input_tokens_seen": 6807288, "step": 35710 }, { "epoch": 18.562889812889814, "grad_norm": 0.19252943992614746, "learning_rate": 7.828512548735623e-07, "loss": 0.0582, "num_input_tokens_seen": 6808248, "step": 35715 }, { "epoch": 18.565488565488565, "grad_norm": 0.20367328822612762, "learning_rate": 7.800383456732757e-07, "loss": 0.0213, "num_input_tokens_seen": 6809208, "step": 35720 }, { "epoch": 18.56808731808732, "grad_norm": 1.7300870418548584, "learning_rate": 7.772304190962643e-07, "loss": 0.1544, "num_input_tokens_seen": 6810168, "step": 35725 }, { "epoch": 18.570686070686072, "grad_norm": 0.6503546833992004, "learning_rate": 7.744274757201769e-07, "loss": 0.091, "num_input_tokens_seen": 6811128, "step": 35730 }, { "epoch": 18.573284823284823, "grad_norm": 0.267092764377594, "learning_rate": 7.71629516121658e-07, "loss": 0.0725, "num_input_tokens_seen": 6812088, "step": 35735 }, { "epoch": 18.575883575883577, "grad_norm": 0.9257805943489075, "learning_rate": 7.68836540876311e-07, "loss": 0.277, "num_input_tokens_seen": 6812984, "step": 35740 }, { "epoch": 18.578482328482327, "grad_norm": 0.29400816559791565, "learning_rate": 7.660485505587122e-07, "loss": 0.106, "num_input_tokens_seen": 6814072, "step": 35745 }, { "epoch": 18.58108108108108, "grad_norm": 0.4690952003002167, "learning_rate": 7.632655457424198e-07, "loss": 0.0755, "num_input_tokens_seen": 6815032, "step": 35750 }, { "epoch": 18.583679833679835, "grad_norm": 0.24025210738182068, "learning_rate": 7.604875269999673e-07, "loss": 0.14, "num_input_tokens_seen": 6815928, "step": 35755 }, { "epoch": 18.586278586278585, "grad_norm": 1.2329438924789429, "learning_rate": 7.577144949028558e-07, "loss": 0.1959, "num_input_tokens_seen": 6816920, "step": 35760 }, { "epoch": 18.58887733887734, "grad_norm": 1.190994143486023, "learning_rate": 7.54946450021568e-07, "loss": 0.1326, "num_input_tokens_seen": 6817912, "step": 35765 }, { "epoch": 18.591476091476093, "grad_norm": 0.38441845774650574, "learning_rate": 7.521833929255512e-07, "loss": 0.0497, "num_input_tokens_seen": 6818872, "step": 35770 }, { "epoch": 18.594074844074843, "grad_norm": 0.4651185870170593, "learning_rate": 7.494253241832367e-07, "loss": 0.1641, "num_input_tokens_seen": 6819896, "step": 35775 }, { "epoch": 18.596673596673597, "grad_norm": 0.38559141755104065, "learning_rate": 7.466722443620261e-07, "loss": 0.1163, "num_input_tokens_seen": 6820856, "step": 35780 }, { "epoch": 18.59927234927235, "grad_norm": 0.8461673855781555, "learning_rate": 7.439241540282887e-07, "loss": 0.0774, "num_input_tokens_seen": 6821816, "step": 35785 }, { "epoch": 18.6018711018711, "grad_norm": 1.14404296875, "learning_rate": 7.411810537473752e-07, "loss": 0.1894, "num_input_tokens_seen": 6822712, "step": 35790 }, { "epoch": 18.604469854469855, "grad_norm": 0.41272851824760437, "learning_rate": 7.38442944083606e-07, "loss": 0.0795, "num_input_tokens_seen": 6823640, "step": 35795 }, { "epoch": 18.60706860706861, "grad_norm": 0.4073035418987274, "learning_rate": 7.357098256002726e-07, "loss": 0.0789, "num_input_tokens_seen": 6824632, "step": 35800 }, { "epoch": 18.60966735966736, "grad_norm": 0.9848169088363647, "learning_rate": 7.329816988596472e-07, "loss": 0.0881, "num_input_tokens_seen": 6825624, "step": 35805 }, { "epoch": 18.612266112266113, "grad_norm": 0.741853654384613, "learning_rate": 7.30258564422967e-07, "loss": 0.2558, "num_input_tokens_seen": 6826616, "step": 35810 }, { "epoch": 18.614864864864863, "grad_norm": 0.06466393172740936, "learning_rate": 7.275404228504506e-07, "loss": 0.0548, "num_input_tokens_seen": 6827608, "step": 35815 }, { "epoch": 18.617463617463617, "grad_norm": 0.06918337196111679, "learning_rate": 7.248272747012786e-07, "loss": 0.172, "num_input_tokens_seen": 6828632, "step": 35820 }, { "epoch": 18.62006237006237, "grad_norm": 0.12794576585292816, "learning_rate": 7.221191205336125e-07, "loss": 0.2392, "num_input_tokens_seen": 6829528, "step": 35825 }, { "epoch": 18.62266112266112, "grad_norm": 0.24283269047737122, "learning_rate": 7.194159609045875e-07, "loss": 0.077, "num_input_tokens_seen": 6830456, "step": 35830 }, { "epoch": 18.625259875259875, "grad_norm": 0.392903596162796, "learning_rate": 7.167177963703031e-07, "loss": 0.1063, "num_input_tokens_seen": 6831416, "step": 35835 }, { "epoch": 18.62785862785863, "grad_norm": 0.7632527947425842, "learning_rate": 7.140246274858347e-07, "loss": 0.1018, "num_input_tokens_seen": 6832280, "step": 35840 }, { "epoch": 18.63045738045738, "grad_norm": 0.47722962498664856, "learning_rate": 7.113364548052392e-07, "loss": 0.0969, "num_input_tokens_seen": 6833272, "step": 35845 }, { "epoch": 18.633056133056133, "grad_norm": 0.24316850304603577, "learning_rate": 7.086532788815354e-07, "loss": 0.1185, "num_input_tokens_seen": 6834264, "step": 35850 }, { "epoch": 18.635654885654887, "grad_norm": 1.5668035745620728, "learning_rate": 7.059751002667092e-07, "loss": 0.1952, "num_input_tokens_seen": 6835256, "step": 35855 }, { "epoch": 18.638253638253637, "grad_norm": 0.5483792424201965, "learning_rate": 7.033019195117313e-07, "loss": 0.0847, "num_input_tokens_seen": 6836216, "step": 35860 }, { "epoch": 18.64085239085239, "grad_norm": 0.24557223916053772, "learning_rate": 7.006337371665395e-07, "loss": 0.1113, "num_input_tokens_seen": 6837208, "step": 35865 }, { "epoch": 18.643451143451145, "grad_norm": 1.2817943096160889, "learning_rate": 6.979705537800418e-07, "loss": 0.1506, "num_input_tokens_seen": 6838104, "step": 35870 }, { "epoch": 18.646049896049895, "grad_norm": 0.6832278966903687, "learning_rate": 6.953123699001141e-07, "loss": 0.0612, "num_input_tokens_seen": 6839032, "step": 35875 }, { "epoch": 18.64864864864865, "grad_norm": 1.1023038625717163, "learning_rate": 6.926591860736131e-07, "loss": 0.1143, "num_input_tokens_seen": 6839960, "step": 35880 }, { "epoch": 18.651247401247403, "grad_norm": 0.9261643290519714, "learning_rate": 6.900110028463635e-07, "loss": 0.2283, "num_input_tokens_seen": 6840920, "step": 35885 }, { "epoch": 18.653846153846153, "grad_norm": 0.4507160186767578, "learning_rate": 6.873678207631573e-07, "loss": 0.3054, "num_input_tokens_seen": 6841848, "step": 35890 }, { "epoch": 18.656444906444907, "grad_norm": 0.9425644874572754, "learning_rate": 6.847296403677539e-07, "loss": 0.1143, "num_input_tokens_seen": 6842680, "step": 35895 }, { "epoch": 18.659043659043657, "grad_norm": 0.6272066831588745, "learning_rate": 6.820964622028997e-07, "loss": 0.1521, "num_input_tokens_seen": 6843672, "step": 35900 }, { "epoch": 18.66164241164241, "grad_norm": 0.11146663129329681, "learning_rate": 6.794682868103003e-07, "loss": 0.1616, "num_input_tokens_seen": 6844600, "step": 35905 }, { "epoch": 18.664241164241165, "grad_norm": 0.9887254238128662, "learning_rate": 6.768451147306287e-07, "loss": 0.0529, "num_input_tokens_seen": 6845528, "step": 35910 }, { "epoch": 18.666839916839916, "grad_norm": 0.17362691462039948, "learning_rate": 6.742269465035394e-07, "loss": 0.0679, "num_input_tokens_seen": 6846360, "step": 35915 }, { "epoch": 18.66943866943867, "grad_norm": 0.8631508946418762, "learning_rate": 6.716137826676461e-07, "loss": 0.1611, "num_input_tokens_seen": 6847288, "step": 35920 }, { "epoch": 18.672037422037423, "grad_norm": 0.6566083431243896, "learning_rate": 6.690056237605464e-07, "loss": 0.127, "num_input_tokens_seen": 6848216, "step": 35925 }, { "epoch": 18.674636174636174, "grad_norm": 0.4183866083621979, "learning_rate": 6.664024703187916e-07, "loss": 0.1934, "num_input_tokens_seen": 6849176, "step": 35930 }, { "epoch": 18.677234927234927, "grad_norm": 0.7154350876808167, "learning_rate": 6.638043228779229e-07, "loss": 0.0716, "num_input_tokens_seen": 6850104, "step": 35935 }, { "epoch": 18.67983367983368, "grad_norm": 0.16490621864795685, "learning_rate": 6.612111819724321e-07, "loss": 0.1106, "num_input_tokens_seen": 6851096, "step": 35940 }, { "epoch": 18.68243243243243, "grad_norm": 0.7999932765960693, "learning_rate": 6.586230481357953e-07, "loss": 0.0631, "num_input_tokens_seen": 6852024, "step": 35945 }, { "epoch": 18.685031185031185, "grad_norm": 0.4639669358730316, "learning_rate": 6.560399219004504e-07, "loss": 0.066, "num_input_tokens_seen": 6852952, "step": 35950 }, { "epoch": 18.68762993762994, "grad_norm": 0.7558088302612305, "learning_rate": 6.534618037978113e-07, "loss": 0.1836, "num_input_tokens_seen": 6853880, "step": 35955 }, { "epoch": 18.69022869022869, "grad_norm": 0.2721492350101471, "learning_rate": 6.508886943582565e-07, "loss": 0.1074, "num_input_tokens_seen": 6854776, "step": 35960 }, { "epoch": 18.692827442827443, "grad_norm": 0.999947190284729, "learning_rate": 6.483205941111348e-07, "loss": 0.1215, "num_input_tokens_seen": 6855736, "step": 35965 }, { "epoch": 18.695426195426194, "grad_norm": 1.102500557899475, "learning_rate": 6.457575035847652e-07, "loss": 0.1048, "num_input_tokens_seen": 6856664, "step": 35970 }, { "epoch": 18.698024948024948, "grad_norm": 0.9835382103919983, "learning_rate": 6.431994233064426e-07, "loss": 0.2111, "num_input_tokens_seen": 6857688, "step": 35975 }, { "epoch": 18.7006237006237, "grad_norm": 0.12633326649665833, "learning_rate": 6.406463538024182e-07, "loss": 0.0357, "num_input_tokens_seen": 6858616, "step": 35980 }, { "epoch": 18.703222453222452, "grad_norm": 1.101314663887024, "learning_rate": 6.380982955979192e-07, "loss": 0.2537, "num_input_tokens_seen": 6859512, "step": 35985 }, { "epoch": 18.705821205821206, "grad_norm": 1.1076287031173706, "learning_rate": 6.355552492171485e-07, "loss": 0.0737, "num_input_tokens_seen": 6860376, "step": 35990 }, { "epoch": 18.70841995841996, "grad_norm": 1.567333698272705, "learning_rate": 6.330172151832653e-07, "loss": 0.1429, "num_input_tokens_seen": 6861368, "step": 35995 }, { "epoch": 18.71101871101871, "grad_norm": 0.05499741807579994, "learning_rate": 6.30484194018402e-07, "loss": 0.145, "num_input_tokens_seen": 6862264, "step": 36000 }, { "epoch": 18.713617463617464, "grad_norm": 0.06511212140321732, "learning_rate": 6.279561862436667e-07, "loss": 0.1115, "num_input_tokens_seen": 6863192, "step": 36005 }, { "epoch": 18.716216216216218, "grad_norm": 0.32830286026000977, "learning_rate": 6.254331923791324e-07, "loss": 0.0731, "num_input_tokens_seen": 6864184, "step": 36010 }, { "epoch": 18.718814968814968, "grad_norm": 1.7864383459091187, "learning_rate": 6.229152129438309e-07, "loss": 0.1116, "num_input_tokens_seen": 6865144, "step": 36015 }, { "epoch": 18.72141372141372, "grad_norm": 0.8473159670829773, "learning_rate": 6.204022484557759e-07, "loss": 0.13, "num_input_tokens_seen": 6866072, "step": 36020 }, { "epoch": 18.724012474012476, "grad_norm": 0.16100716590881348, "learning_rate": 6.178942994319453e-07, "loss": 0.1272, "num_input_tokens_seen": 6867000, "step": 36025 }, { "epoch": 18.726611226611226, "grad_norm": 0.8962586522102356, "learning_rate": 6.153913663882821e-07, "loss": 0.1524, "num_input_tokens_seen": 6867896, "step": 36030 }, { "epoch": 18.72920997920998, "grad_norm": 0.38383302092552185, "learning_rate": 6.128934498396993e-07, "loss": 0.105, "num_input_tokens_seen": 6868856, "step": 36035 }, { "epoch": 18.731808731808734, "grad_norm": 1.5857317447662354, "learning_rate": 6.104005503000776e-07, "loss": 0.2882, "num_input_tokens_seen": 6869816, "step": 36040 }, { "epoch": 18.734407484407484, "grad_norm": 1.3479599952697754, "learning_rate": 6.079126682822677e-07, "loss": 0.2219, "num_input_tokens_seen": 6870680, "step": 36045 }, { "epoch": 18.737006237006238, "grad_norm": 1.0396777391433716, "learning_rate": 6.054298042980827e-07, "loss": 0.1363, "num_input_tokens_seen": 6871640, "step": 36050 }, { "epoch": 18.739604989604988, "grad_norm": 1.106685757637024, "learning_rate": 6.029519588583082e-07, "loss": 0.1128, "num_input_tokens_seen": 6872568, "step": 36055 }, { "epoch": 18.742203742203742, "grad_norm": 0.26330745220184326, "learning_rate": 6.004791324726978e-07, "loss": 0.183, "num_input_tokens_seen": 6873528, "step": 36060 }, { "epoch": 18.744802494802496, "grad_norm": 1.6056687831878662, "learning_rate": 5.980113256499719e-07, "loss": 0.1227, "num_input_tokens_seen": 6874488, "step": 36065 }, { "epoch": 18.747401247401246, "grad_norm": 0.8184874057769775, "learning_rate": 5.955485388978083e-07, "loss": 0.1636, "num_input_tokens_seen": 6875512, "step": 36070 }, { "epoch": 18.75, "grad_norm": 0.8759660720825195, "learning_rate": 5.930907727228734e-07, "loss": 0.1597, "num_input_tokens_seen": 6876440, "step": 36075 }, { "epoch": 18.752598752598754, "grad_norm": 0.70203697681427, "learning_rate": 5.906380276307827e-07, "loss": 0.1287, "num_input_tokens_seen": 6877464, "step": 36080 }, { "epoch": 18.755197505197504, "grad_norm": 0.5949529409408569, "learning_rate": 5.881903041261211e-07, "loss": 0.0434, "num_input_tokens_seen": 6878424, "step": 36085 }, { "epoch": 18.757796257796258, "grad_norm": 0.39249008893966675, "learning_rate": 5.85747602712447e-07, "loss": 0.0978, "num_input_tokens_seen": 6879352, "step": 36090 }, { "epoch": 18.760395010395012, "grad_norm": 0.3533625602722168, "learning_rate": 5.833099238922834e-07, "loss": 0.2583, "num_input_tokens_seen": 6880280, "step": 36095 }, { "epoch": 18.762993762993762, "grad_norm": 0.2742450535297394, "learning_rate": 5.808772681671182e-07, "loss": 0.0806, "num_input_tokens_seen": 6881240, "step": 36100 }, { "epoch": 18.765592515592516, "grad_norm": 0.47228604555130005, "learning_rate": 5.784496360374037e-07, "loss": 0.0877, "num_input_tokens_seen": 6882168, "step": 36105 }, { "epoch": 18.768191268191266, "grad_norm": 1.0581871271133423, "learning_rate": 5.760270280025653e-07, "loss": 0.0932, "num_input_tokens_seen": 6883128, "step": 36110 }, { "epoch": 18.77079002079002, "grad_norm": 0.5880107283592224, "learning_rate": 5.736094445609907e-07, "loss": 0.1874, "num_input_tokens_seen": 6884056, "step": 36115 }, { "epoch": 18.773388773388774, "grad_norm": 0.15607212483882904, "learning_rate": 5.711968862100319e-07, "loss": 0.1374, "num_input_tokens_seen": 6885016, "step": 36120 }, { "epoch": 18.775987525987524, "grad_norm": 0.8467894196510315, "learning_rate": 5.687893534460087e-07, "loss": 0.1625, "num_input_tokens_seen": 6885944, "step": 36125 }, { "epoch": 18.77858627858628, "grad_norm": 0.3785181939601898, "learning_rate": 5.663868467642136e-07, "loss": 0.1153, "num_input_tokens_seen": 6886904, "step": 36130 }, { "epoch": 18.781185031185032, "grad_norm": 0.19215355813503265, "learning_rate": 5.639893666588986e-07, "loss": 0.1598, "num_input_tokens_seen": 6887864, "step": 36135 }, { "epoch": 18.783783783783782, "grad_norm": 0.7881821990013123, "learning_rate": 5.615969136232774e-07, "loss": 0.0805, "num_input_tokens_seen": 6888856, "step": 36140 }, { "epoch": 18.786382536382536, "grad_norm": 0.861548900604248, "learning_rate": 5.592094881495341e-07, "loss": 0.0694, "num_input_tokens_seen": 6889752, "step": 36145 }, { "epoch": 18.78898128898129, "grad_norm": 0.21344900131225586, "learning_rate": 5.568270907288287e-07, "loss": 0.2272, "num_input_tokens_seen": 6890648, "step": 36150 }, { "epoch": 18.79158004158004, "grad_norm": 1.1894899606704712, "learning_rate": 5.544497218512689e-07, "loss": 0.0811, "num_input_tokens_seen": 6891608, "step": 36155 }, { "epoch": 18.794178794178794, "grad_norm": 0.22809220850467682, "learning_rate": 5.520773820059361e-07, "loss": 0.1046, "num_input_tokens_seen": 6892536, "step": 36160 }, { "epoch": 18.796777546777548, "grad_norm": 0.4290755093097687, "learning_rate": 5.497100716808784e-07, "loss": 0.0546, "num_input_tokens_seen": 6893496, "step": 36165 }, { "epoch": 18.7993762993763, "grad_norm": 0.2539904713630676, "learning_rate": 5.47347791363112e-07, "loss": 0.0666, "num_input_tokens_seen": 6894424, "step": 36170 }, { "epoch": 18.801975051975052, "grad_norm": 0.39495763182640076, "learning_rate": 5.449905415386119e-07, "loss": 0.0909, "num_input_tokens_seen": 6895416, "step": 36175 }, { "epoch": 18.804573804573806, "grad_norm": 0.2141900360584259, "learning_rate": 5.426383226923154e-07, "loss": 0.1225, "num_input_tokens_seen": 6896376, "step": 36180 }, { "epoch": 18.807172557172557, "grad_norm": 0.49077507853507996, "learning_rate": 5.402911353081352e-07, "loss": 0.0506, "num_input_tokens_seen": 6897336, "step": 36185 }, { "epoch": 18.80977130977131, "grad_norm": 0.5611040592193604, "learning_rate": 5.379489798689435e-07, "loss": 0.1656, "num_input_tokens_seen": 6898328, "step": 36190 }, { "epoch": 18.81237006237006, "grad_norm": 0.11866690218448639, "learning_rate": 5.356118568565744e-07, "loss": 0.1036, "num_input_tokens_seen": 6899256, "step": 36195 }, { "epoch": 18.814968814968815, "grad_norm": 0.4156947433948517, "learning_rate": 5.332797667518346e-07, "loss": 0.1022, "num_input_tokens_seen": 6900216, "step": 36200 }, { "epoch": 18.81756756756757, "grad_norm": 0.18601395189762115, "learning_rate": 5.30952710034488e-07, "loss": 0.1031, "num_input_tokens_seen": 6901240, "step": 36205 }, { "epoch": 18.82016632016632, "grad_norm": 1.5244956016540527, "learning_rate": 5.286306871832653e-07, "loss": 0.1579, "num_input_tokens_seen": 6902232, "step": 36210 }, { "epoch": 18.822765072765073, "grad_norm": 0.9804543852806091, "learning_rate": 5.263136986758593e-07, "loss": 0.2262, "num_input_tokens_seen": 6903224, "step": 36215 }, { "epoch": 18.825363825363826, "grad_norm": 0.47281476855278015, "learning_rate": 5.240017449889361e-07, "loss": 0.1134, "num_input_tokens_seen": 6904120, "step": 36220 }, { "epoch": 18.827962577962577, "grad_norm": 0.9442428350448608, "learning_rate": 5.216948265981153e-07, "loss": 0.2843, "num_input_tokens_seen": 6905112, "step": 36225 }, { "epoch": 18.83056133056133, "grad_norm": 0.3716166317462921, "learning_rate": 5.193929439779865e-07, "loss": 0.0225, "num_input_tokens_seen": 6906072, "step": 36230 }, { "epoch": 18.833160083160084, "grad_norm": 0.8607296943664551, "learning_rate": 5.170960976020989e-07, "loss": 0.1872, "num_input_tokens_seen": 6907032, "step": 36235 }, { "epoch": 18.835758835758835, "grad_norm": 0.4074404239654541, "learning_rate": 5.148042879429771e-07, "loss": 0.1012, "num_input_tokens_seen": 6907992, "step": 36240 }, { "epoch": 18.83835758835759, "grad_norm": 0.34918275475502014, "learning_rate": 5.125175154720912e-07, "loss": 0.1512, "num_input_tokens_seen": 6908952, "step": 36245 }, { "epoch": 18.840956340956343, "grad_norm": 0.900185763835907, "learning_rate": 5.102357806598845e-07, "loss": 0.2003, "num_input_tokens_seen": 6909944, "step": 36250 }, { "epoch": 18.843555093555093, "grad_norm": 1.1399353742599487, "learning_rate": 5.079590839757731e-07, "loss": 0.1705, "num_input_tokens_seen": 6910840, "step": 36255 }, { "epoch": 18.846153846153847, "grad_norm": 1.2956115007400513, "learning_rate": 5.056874258881211e-07, "loss": 0.2345, "num_input_tokens_seen": 6911832, "step": 36260 }, { "epoch": 18.848752598752597, "grad_norm": 0.0891825333237648, "learning_rate": 5.034208068642632e-07, "loss": 0.1088, "num_input_tokens_seen": 6912760, "step": 36265 }, { "epoch": 18.85135135135135, "grad_norm": 0.6187438368797302, "learning_rate": 5.011592273704984e-07, "loss": 0.062, "num_input_tokens_seen": 6913720, "step": 36270 }, { "epoch": 18.853950103950105, "grad_norm": 0.6121053695678711, "learning_rate": 4.989026878720882e-07, "loss": 0.1214, "num_input_tokens_seen": 6914616, "step": 36275 }, { "epoch": 18.856548856548855, "grad_norm": 0.6794176697731018, "learning_rate": 4.966511888332553e-07, "loss": 0.0922, "num_input_tokens_seen": 6915576, "step": 36280 }, { "epoch": 18.85914760914761, "grad_norm": 1.5206366777420044, "learning_rate": 4.944047307171851e-07, "loss": 0.1142, "num_input_tokens_seen": 6916440, "step": 36285 }, { "epoch": 18.861746361746363, "grad_norm": 1.3310134410858154, "learning_rate": 4.921633139860299e-07, "loss": 0.2162, "num_input_tokens_seen": 6917400, "step": 36290 }, { "epoch": 18.864345114345113, "grad_norm": 0.4363921582698822, "learning_rate": 4.899269391009042e-07, "loss": 0.2078, "num_input_tokens_seen": 6918360, "step": 36295 }, { "epoch": 18.866943866943867, "grad_norm": 0.5579184889793396, "learning_rate": 4.876956065218791e-07, "loss": 0.1161, "num_input_tokens_seen": 6919352, "step": 36300 }, { "epoch": 18.86954261954262, "grad_norm": 0.8305104970932007, "learning_rate": 4.854693167079982e-07, "loss": 0.0523, "num_input_tokens_seen": 6920216, "step": 36305 }, { "epoch": 18.87214137214137, "grad_norm": 0.9663694500923157, "learning_rate": 4.832480701172564e-07, "loss": 0.1412, "num_input_tokens_seen": 6921240, "step": 36310 }, { "epoch": 18.874740124740125, "grad_norm": 0.6197991371154785, "learning_rate": 4.810318672066216e-07, "loss": 0.2642, "num_input_tokens_seen": 6922296, "step": 36315 }, { "epoch": 18.87733887733888, "grad_norm": 1.1975144147872925, "learning_rate": 4.788207084320178e-07, "loss": 0.1963, "num_input_tokens_seen": 6923192, "step": 36320 }, { "epoch": 18.87993762993763, "grad_norm": 0.23571592569351196, "learning_rate": 4.7661459424833134e-07, "loss": 0.1133, "num_input_tokens_seen": 6924216, "step": 36325 }, { "epoch": 18.882536382536383, "grad_norm": 1.908888339996338, "learning_rate": 4.744135251094184e-07, "loss": 0.1267, "num_input_tokens_seen": 6925144, "step": 36330 }, { "epoch": 18.885135135135137, "grad_norm": 0.11269267648458481, "learning_rate": 4.722175014680835e-07, "loss": 0.0569, "num_input_tokens_seen": 6926104, "step": 36335 }, { "epoch": 18.887733887733887, "grad_norm": 1.4573990106582642, "learning_rate": 4.7002652377610423e-07, "loss": 0.1308, "num_input_tokens_seen": 6927064, "step": 36340 }, { "epoch": 18.89033264033264, "grad_norm": 1.725288987159729, "learning_rate": 4.6784059248422276e-07, "loss": 0.155, "num_input_tokens_seen": 6927992, "step": 36345 }, { "epoch": 18.89293139293139, "grad_norm": 1.1775791645050049, "learning_rate": 4.656597080421293e-07, "loss": 0.1634, "num_input_tokens_seen": 6928952, "step": 36350 }, { "epoch": 18.895530145530145, "grad_norm": 0.8588786125183105, "learning_rate": 4.6348387089848456e-07, "loss": 0.1802, "num_input_tokens_seen": 6929912, "step": 36355 }, { "epoch": 18.8981288981289, "grad_norm": 0.26249560713768005, "learning_rate": 4.613130815009137e-07, "loss": 0.1171, "num_input_tokens_seen": 6930872, "step": 36360 }, { "epoch": 18.90072765072765, "grad_norm": 0.19639335572719574, "learning_rate": 4.591473402959984e-07, "loss": 0.1129, "num_input_tokens_seen": 6931768, "step": 36365 }, { "epoch": 18.903326403326403, "grad_norm": 0.2689746022224426, "learning_rate": 4.56986647729285e-07, "loss": 0.1081, "num_input_tokens_seen": 6932664, "step": 36370 }, { "epoch": 18.905925155925157, "grad_norm": 0.9414379596710205, "learning_rate": 4.548310042452736e-07, "loss": 0.1643, "num_input_tokens_seen": 6933592, "step": 36375 }, { "epoch": 18.908523908523907, "grad_norm": 0.48940008878707886, "learning_rate": 4.5268041028743714e-07, "loss": 0.1055, "num_input_tokens_seen": 6934520, "step": 36380 }, { "epoch": 18.91112266112266, "grad_norm": 0.05875932797789574, "learning_rate": 4.505348662982023e-07, "loss": 0.1443, "num_input_tokens_seen": 6935416, "step": 36385 }, { "epoch": 18.913721413721415, "grad_norm": 0.3727584183216095, "learning_rate": 4.483943727189577e-07, "loss": 0.088, "num_input_tokens_seen": 6936440, "step": 36390 }, { "epoch": 18.916320166320165, "grad_norm": 1.1663968563079834, "learning_rate": 4.4625892999005383e-07, "loss": 0.1386, "num_input_tokens_seen": 6937368, "step": 36395 }, { "epoch": 18.91891891891892, "grad_norm": 1.5851668119430542, "learning_rate": 4.4412853855080317e-07, "loss": 0.0768, "num_input_tokens_seen": 6938296, "step": 36400 }, { "epoch": 18.921517671517673, "grad_norm": 0.080940380692482, "learning_rate": 4.4200319883948013e-07, "loss": 0.089, "num_input_tokens_seen": 6939256, "step": 36405 }, { "epoch": 18.924116424116423, "grad_norm": 0.38849538564682007, "learning_rate": 4.398829112933128e-07, "loss": 0.1282, "num_input_tokens_seen": 6940216, "step": 36410 }, { "epoch": 18.926715176715177, "grad_norm": 0.13687212765216827, "learning_rate": 4.377676763484967e-07, "loss": 0.1177, "num_input_tokens_seen": 6941144, "step": 36415 }, { "epoch": 18.929313929313928, "grad_norm": 0.9924302101135254, "learning_rate": 4.356574944401892e-07, "loss": 0.2052, "num_input_tokens_seen": 6942104, "step": 36420 }, { "epoch": 18.93191268191268, "grad_norm": 1.010340929031372, "learning_rate": 4.335523660025043e-07, "loss": 0.1373, "num_input_tokens_seen": 6943032, "step": 36425 }, { "epoch": 18.934511434511435, "grad_norm": 0.6122754216194153, "learning_rate": 4.314522914685121e-07, "loss": 0.0567, "num_input_tokens_seen": 6944056, "step": 36430 }, { "epoch": 18.937110187110186, "grad_norm": 1.09135103225708, "learning_rate": 4.2935727127025314e-07, "loss": 0.1092, "num_input_tokens_seen": 6945016, "step": 36435 }, { "epoch": 18.93970893970894, "grad_norm": 0.3833441734313965, "learning_rate": 4.2726730583872165e-07, "loss": 0.1147, "num_input_tokens_seen": 6945944, "step": 36440 }, { "epoch": 18.942307692307693, "grad_norm": 0.9070149660110474, "learning_rate": 4.2518239560387086e-07, "loss": 0.0596, "num_input_tokens_seen": 6946904, "step": 36445 }, { "epoch": 18.944906444906444, "grad_norm": 0.15128962695598602, "learning_rate": 4.231025409946188e-07, "loss": 0.0631, "num_input_tokens_seen": 6947864, "step": 36450 }, { "epoch": 18.947505197505198, "grad_norm": 1.3448262214660645, "learning_rate": 4.210277424388426e-07, "loss": 0.2906, "num_input_tokens_seen": 6948792, "step": 36455 }, { "epoch": 18.95010395010395, "grad_norm": 0.41591575741767883, "learning_rate": 4.189580003633731e-07, "loss": 0.1581, "num_input_tokens_seen": 6949816, "step": 36460 }, { "epoch": 18.9527027027027, "grad_norm": 0.5668858885765076, "learning_rate": 4.1689331519400853e-07, "loss": 0.0708, "num_input_tokens_seen": 6950680, "step": 36465 }, { "epoch": 18.955301455301456, "grad_norm": 0.8503766059875488, "learning_rate": 4.1483368735550645e-07, "loss": 0.1491, "num_input_tokens_seen": 6951640, "step": 36470 }, { "epoch": 18.95790020790021, "grad_norm": 0.4050060510635376, "learning_rate": 4.127791172715778e-07, "loss": 0.1037, "num_input_tokens_seen": 6952600, "step": 36475 }, { "epoch": 18.96049896049896, "grad_norm": 0.19452472031116486, "learning_rate": 4.107296053648985e-07, "loss": 0.201, "num_input_tokens_seen": 6953592, "step": 36480 }, { "epoch": 18.963097713097714, "grad_norm": 0.3499811887741089, "learning_rate": 4.086851520570978e-07, "loss": 0.0782, "num_input_tokens_seen": 6954488, "step": 36485 }, { "epoch": 18.965696465696467, "grad_norm": 0.7549309134483337, "learning_rate": 4.066457577687727e-07, "loss": 0.1844, "num_input_tokens_seen": 6955448, "step": 36490 }, { "epoch": 18.968295218295218, "grad_norm": 0.758299708366394, "learning_rate": 4.046114229194764e-07, "loss": 0.1865, "num_input_tokens_seen": 6956440, "step": 36495 }, { "epoch": 18.97089397089397, "grad_norm": 0.49855250120162964, "learning_rate": 4.0258214792771585e-07, "loss": 0.0802, "num_input_tokens_seen": 6957336, "step": 36500 }, { "epoch": 18.973492723492722, "grad_norm": 0.39113926887512207, "learning_rate": 4.0055793321096266e-07, "loss": 0.1119, "num_input_tokens_seen": 6958328, "step": 36505 }, { "epoch": 18.976091476091476, "grad_norm": 0.3264562487602234, "learning_rate": 3.9853877918564764e-07, "loss": 0.0962, "num_input_tokens_seen": 6959384, "step": 36510 }, { "epoch": 18.97869022869023, "grad_norm": 0.1145494133234024, "learning_rate": 3.965246862671579e-07, "loss": 0.1405, "num_input_tokens_seen": 6960312, "step": 36515 }, { "epoch": 18.98128898128898, "grad_norm": 0.931904137134552, "learning_rate": 3.9451565486983976e-07, "loss": 0.0805, "num_input_tokens_seen": 6961304, "step": 36520 }, { "epoch": 18.983887733887734, "grad_norm": 0.3290283679962158, "learning_rate": 3.925116854069988e-07, "loss": 0.079, "num_input_tokens_seen": 6962360, "step": 36525 }, { "epoch": 18.986486486486488, "grad_norm": 0.04272269085049629, "learning_rate": 3.905127782909024e-07, "loss": 0.065, "num_input_tokens_seen": 6963256, "step": 36530 }, { "epoch": 18.989085239085238, "grad_norm": 0.13540132343769073, "learning_rate": 3.885189339327688e-07, "loss": 0.0723, "num_input_tokens_seen": 6964216, "step": 36535 }, { "epoch": 18.991683991683992, "grad_norm": 0.8987411856651306, "learning_rate": 3.8653015274278117e-07, "loss": 0.0551, "num_input_tokens_seen": 6965144, "step": 36540 }, { "epoch": 18.994282744282746, "grad_norm": 0.8653677105903625, "learning_rate": 3.845464351300787e-07, "loss": 0.1159, "num_input_tokens_seen": 6966104, "step": 36545 }, { "epoch": 18.996881496881496, "grad_norm": 0.4675668478012085, "learning_rate": 3.825677815027628e-07, "loss": 0.0774, "num_input_tokens_seen": 6967032, "step": 36550 }, { "epoch": 18.99948024948025, "grad_norm": 0.4199357330799103, "learning_rate": 3.8059419226788297e-07, "loss": 0.0517, "num_input_tokens_seen": 6967928, "step": 36555 }, { "epoch": 19.0, "eval_loss": 0.1469559222459793, "eval_runtime": 8.0296, "eval_samples_per_second": 106.605, "eval_steps_per_second": 26.651, "num_input_tokens_seen": 6968096, "step": 36556 }, { "epoch": 19.002079002079004, "grad_norm": 0.9815636277198792, "learning_rate": 3.7862566783146147e-07, "loss": 0.1782, "num_input_tokens_seen": 6968832, "step": 36560 }, { "epoch": 19.004677754677754, "grad_norm": 0.5956708788871765, "learning_rate": 3.766622085984661e-07, "loss": 0.0913, "num_input_tokens_seen": 6969760, "step": 36565 }, { "epoch": 19.007276507276508, "grad_norm": 0.7847219109535217, "learning_rate": 3.747038149728266e-07, "loss": 0.1051, "num_input_tokens_seen": 6970688, "step": 36570 }, { "epoch": 19.00987525987526, "grad_norm": 0.14983369410037994, "learning_rate": 3.7275048735743167e-07, "loss": 0.0362, "num_input_tokens_seen": 6971648, "step": 36575 }, { "epoch": 19.012474012474012, "grad_norm": 1.782934546470642, "learning_rate": 3.708022261541294e-07, "loss": 0.3581, "num_input_tokens_seen": 6972704, "step": 36580 }, { "epoch": 19.015072765072766, "grad_norm": 0.4699954390525818, "learning_rate": 3.688590317637214e-07, "loss": 0.2977, "num_input_tokens_seen": 6973696, "step": 36585 }, { "epoch": 19.017671517671516, "grad_norm": 0.20119696855545044, "learning_rate": 3.6692090458597115e-07, "loss": 0.0497, "num_input_tokens_seen": 6974624, "step": 36590 }, { "epoch": 19.02027027027027, "grad_norm": 0.9349967837333679, "learning_rate": 3.6498784501959595e-07, "loss": 0.1458, "num_input_tokens_seen": 6975552, "step": 36595 }, { "epoch": 19.022869022869024, "grad_norm": 0.47195935249328613, "learning_rate": 3.63059853462272e-07, "loss": 0.0932, "num_input_tokens_seen": 6976512, "step": 36600 }, { "epoch": 19.025467775467774, "grad_norm": 0.47085195779800415, "learning_rate": 3.611369303106349e-07, "loss": 0.0794, "num_input_tokens_seen": 6977536, "step": 36605 }, { "epoch": 19.028066528066528, "grad_norm": 0.8989771604537964, "learning_rate": 3.592190759602709e-07, "loss": 0.1062, "num_input_tokens_seen": 6978528, "step": 36610 }, { "epoch": 19.030665280665282, "grad_norm": 0.47055739164352417, "learning_rate": 3.5730629080573676e-07, "loss": 0.0892, "num_input_tokens_seen": 6979488, "step": 36615 }, { "epoch": 19.033264033264032, "grad_norm": 0.2473747283220291, "learning_rate": 3.5539857524053145e-07, "loss": 0.1147, "num_input_tokens_seen": 6980512, "step": 36620 }, { "epoch": 19.035862785862786, "grad_norm": 0.7540170550346375, "learning_rate": 3.534959296571189e-07, "loss": 0.1604, "num_input_tokens_seen": 6981472, "step": 36625 }, { "epoch": 19.03846153846154, "grad_norm": 0.3396070599555969, "learning_rate": 3.5159835444691926e-07, "loss": 0.0576, "num_input_tokens_seen": 6982400, "step": 36630 }, { "epoch": 19.04106029106029, "grad_norm": 0.850355863571167, "learning_rate": 3.497058500003092e-07, "loss": 0.2829, "num_input_tokens_seen": 6983456, "step": 36635 }, { "epoch": 19.043659043659044, "grad_norm": 0.13806478679180145, "learning_rate": 3.4781841670662175e-07, "loss": 0.1148, "num_input_tokens_seen": 6984416, "step": 36640 }, { "epoch": 19.046257796257795, "grad_norm": 1.2022554874420166, "learning_rate": 3.4593605495414626e-07, "loss": 0.1326, "num_input_tokens_seen": 6985472, "step": 36645 }, { "epoch": 19.04885654885655, "grad_norm": 0.0760929137468338, "learning_rate": 3.440587651301286e-07, "loss": 0.1154, "num_input_tokens_seen": 6986400, "step": 36650 }, { "epoch": 19.051455301455302, "grad_norm": 0.12776526808738708, "learning_rate": 3.421865476207764e-07, "loss": 0.1229, "num_input_tokens_seen": 6987296, "step": 36655 }, { "epoch": 19.054054054054053, "grad_norm": 0.8132994174957275, "learning_rate": 3.403194028112455e-07, "loss": 0.133, "num_input_tokens_seen": 6988224, "step": 36660 }, { "epoch": 19.056652806652806, "grad_norm": 0.9174444675445557, "learning_rate": 3.3845733108564814e-07, "loss": 0.0856, "num_input_tokens_seen": 6989120, "step": 36665 }, { "epoch": 19.05925155925156, "grad_norm": 0.2201354056596756, "learning_rate": 3.366003328270667e-07, "loss": 0.0569, "num_input_tokens_seen": 6990048, "step": 36670 }, { "epoch": 19.06185031185031, "grad_norm": 1.6277072429656982, "learning_rate": 3.347484084175234e-07, "loss": 0.1763, "num_input_tokens_seen": 6991040, "step": 36675 }, { "epoch": 19.064449064449065, "grad_norm": 0.44416099786758423, "learning_rate": 3.3290155823800515e-07, "loss": 0.0938, "num_input_tokens_seen": 6991968, "step": 36680 }, { "epoch": 19.06704781704782, "grad_norm": 0.7984893321990967, "learning_rate": 3.310597826684525e-07, "loss": 0.0474, "num_input_tokens_seen": 6992864, "step": 36685 }, { "epoch": 19.06964656964657, "grad_norm": 0.3716520369052887, "learning_rate": 3.292230820877623e-07, "loss": 0.1205, "num_input_tokens_seen": 6993856, "step": 36690 }, { "epoch": 19.072245322245323, "grad_norm": 0.23526866734027863, "learning_rate": 3.273914568737879e-07, "loss": 0.1137, "num_input_tokens_seen": 6994816, "step": 36695 }, { "epoch": 19.074844074844076, "grad_norm": 0.27197009325027466, "learning_rate": 3.25564907403339e-07, "loss": 0.0563, "num_input_tokens_seen": 6995872, "step": 36700 }, { "epoch": 19.077442827442827, "grad_norm": 1.0538047552108765, "learning_rate": 3.237434340521789e-07, "loss": 0.1539, "num_input_tokens_seen": 6996800, "step": 36705 }, { "epoch": 19.08004158004158, "grad_norm": 0.41450926661491394, "learning_rate": 3.2192703719503005e-07, "loss": 0.2104, "num_input_tokens_seen": 6997792, "step": 36710 }, { "epoch": 19.08264033264033, "grad_norm": 0.09852100908756256, "learning_rate": 3.2011571720556575e-07, "loss": 0.046, "num_input_tokens_seen": 6998688, "step": 36715 }, { "epoch": 19.085239085239085, "grad_norm": 0.7251068353652954, "learning_rate": 3.183094744564213e-07, "loss": 0.1702, "num_input_tokens_seen": 6999680, "step": 36720 }, { "epoch": 19.08783783783784, "grad_norm": 0.08913570642471313, "learning_rate": 3.165083093191801e-07, "loss": 0.0578, "num_input_tokens_seen": 7000640, "step": 36725 }, { "epoch": 19.09043659043659, "grad_norm": 0.36795830726623535, "learning_rate": 3.1471222216438723e-07, "loss": 0.028, "num_input_tokens_seen": 7001568, "step": 36730 }, { "epoch": 19.093035343035343, "grad_norm": 0.5117580890655518, "learning_rate": 3.129212133615389e-07, "loss": 0.1083, "num_input_tokens_seen": 7002496, "step": 36735 }, { "epoch": 19.095634095634097, "grad_norm": 0.5490829944610596, "learning_rate": 3.111352832790904e-07, "loss": 0.114, "num_input_tokens_seen": 7003456, "step": 36740 }, { "epoch": 19.098232848232847, "grad_norm": 0.7229372262954712, "learning_rate": 3.093544322844505e-07, "loss": 0.1547, "num_input_tokens_seen": 7004416, "step": 36745 }, { "epoch": 19.1008316008316, "grad_norm": 0.950105607509613, "learning_rate": 3.07578660743979e-07, "loss": 0.0858, "num_input_tokens_seen": 7005440, "step": 36750 }, { "epoch": 19.103430353430355, "grad_norm": 0.906943142414093, "learning_rate": 3.058079690229948e-07, "loss": 0.0649, "num_input_tokens_seen": 7006336, "step": 36755 }, { "epoch": 19.106029106029105, "grad_norm": 0.18486037850379944, "learning_rate": 3.040423574857759e-07, "loss": 0.0862, "num_input_tokens_seen": 7007264, "step": 36760 }, { "epoch": 19.10862785862786, "grad_norm": 0.18655067682266235, "learning_rate": 3.0228182649555126e-07, "loss": 0.069, "num_input_tokens_seen": 7008224, "step": 36765 }, { "epoch": 19.111226611226613, "grad_norm": 0.1112365797162056, "learning_rate": 3.00526376414495e-07, "loss": 0.1026, "num_input_tokens_seen": 7009152, "step": 36770 }, { "epoch": 19.113825363825363, "grad_norm": 0.7062147855758667, "learning_rate": 2.9877600760375434e-07, "loss": 0.1017, "num_input_tokens_seen": 7010112, "step": 36775 }, { "epoch": 19.116424116424117, "grad_norm": 1.0459240674972534, "learning_rate": 2.970307204234163e-07, "loss": 0.1637, "num_input_tokens_seen": 7011040, "step": 36780 }, { "epoch": 19.11902286902287, "grad_norm": 0.13976918160915375, "learning_rate": 2.9529051523253247e-07, "loss": 0.0493, "num_input_tokens_seen": 7011936, "step": 36785 }, { "epoch": 19.12162162162162, "grad_norm": 0.26179683208465576, "learning_rate": 2.935553923890999e-07, "loss": 0.0768, "num_input_tokens_seen": 7012928, "step": 36790 }, { "epoch": 19.124220374220375, "grad_norm": 1.1315745115280151, "learning_rate": 2.9182535225007744e-07, "loss": 0.1658, "num_input_tokens_seen": 7013888, "step": 36795 }, { "epoch": 19.126819126819125, "grad_norm": 0.5056440234184265, "learning_rate": 2.9010039517137487e-07, "loss": 0.0857, "num_input_tokens_seen": 7014880, "step": 36800 }, { "epoch": 19.12941787941788, "grad_norm": 1.08942449092865, "learning_rate": 2.883805215078583e-07, "loss": 0.151, "num_input_tokens_seen": 7015872, "step": 36805 }, { "epoch": 19.132016632016633, "grad_norm": 0.6972222328186035, "learning_rate": 2.8666573161334475e-07, "loss": 0.1465, "num_input_tokens_seen": 7016832, "step": 36810 }, { "epoch": 19.134615384615383, "grad_norm": 0.1352255791425705, "learning_rate": 2.849560258406075e-07, "loss": 0.0488, "num_input_tokens_seen": 7017792, "step": 36815 }, { "epoch": 19.137214137214137, "grad_norm": 1.7643566131591797, "learning_rate": 2.8325140454137646e-07, "loss": 0.1574, "num_input_tokens_seen": 7018816, "step": 36820 }, { "epoch": 19.13981288981289, "grad_norm": 0.8528592586517334, "learning_rate": 2.815518680663293e-07, "loss": 0.0932, "num_input_tokens_seen": 7019712, "step": 36825 }, { "epoch": 19.14241164241164, "grad_norm": 0.4560539722442627, "learning_rate": 2.798574167651058e-07, "loss": 0.0936, "num_input_tokens_seen": 7020640, "step": 36830 }, { "epoch": 19.145010395010395, "grad_norm": 0.995337188243866, "learning_rate": 2.7816805098628826e-07, "loss": 0.2183, "num_input_tokens_seen": 7021632, "step": 36835 }, { "epoch": 19.14760914760915, "grad_norm": 0.11685173958539963, "learning_rate": 2.7648377107742375e-07, "loss": 0.122, "num_input_tokens_seen": 7022688, "step": 36840 }, { "epoch": 19.1502079002079, "grad_norm": 1.2147223949432373, "learning_rate": 2.748045773850072e-07, "loss": 0.1587, "num_input_tokens_seen": 7023648, "step": 36845 }, { "epoch": 19.152806652806653, "grad_norm": 0.30288413166999817, "learning_rate": 2.731304702544929e-07, "loss": 0.0872, "num_input_tokens_seen": 7024576, "step": 36850 }, { "epoch": 19.155405405405407, "grad_norm": 0.9247974753379822, "learning_rate": 2.7146145003028036e-07, "loss": 0.204, "num_input_tokens_seen": 7025536, "step": 36855 }, { "epoch": 19.158004158004157, "grad_norm": 0.2621215581893921, "learning_rate": 2.6979751705572544e-07, "loss": 0.0779, "num_input_tokens_seen": 7026496, "step": 36860 }, { "epoch": 19.16060291060291, "grad_norm": 0.21823804080486298, "learning_rate": 2.6813867167314324e-07, "loss": 0.1728, "num_input_tokens_seen": 7027456, "step": 36865 }, { "epoch": 19.16320166320166, "grad_norm": 1.3473023176193237, "learning_rate": 2.664849142237968e-07, "loss": 0.1078, "num_input_tokens_seen": 7028320, "step": 36870 }, { "epoch": 19.165800415800415, "grad_norm": 1.0368139743804932, "learning_rate": 2.6483624504790026e-07, "loss": 0.3906, "num_input_tokens_seen": 7029280, "step": 36875 }, { "epoch": 19.16839916839917, "grad_norm": 1.071135401725769, "learning_rate": 2.6319266448462666e-07, "loss": 0.1687, "num_input_tokens_seen": 7030304, "step": 36880 }, { "epoch": 19.17099792099792, "grad_norm": 0.18166576325893402, "learning_rate": 2.6155417287209994e-07, "loss": 0.1239, "num_input_tokens_seen": 7031200, "step": 36885 }, { "epoch": 19.173596673596673, "grad_norm": 0.1632104068994522, "learning_rate": 2.5992077054739774e-07, "loss": 0.195, "num_input_tokens_seen": 7032192, "step": 36890 }, { "epoch": 19.176195426195427, "grad_norm": 0.8149278163909912, "learning_rate": 2.5829245784654843e-07, "loss": 0.1017, "num_input_tokens_seen": 7033088, "step": 36895 }, { "epoch": 19.178794178794178, "grad_norm": 0.9316343069076538, "learning_rate": 2.566692351045341e-07, "loss": 0.114, "num_input_tokens_seen": 7034144, "step": 36900 }, { "epoch": 19.18139293139293, "grad_norm": 0.8964475989341736, "learning_rate": 2.5505110265529044e-07, "loss": 0.165, "num_input_tokens_seen": 7035136, "step": 36905 }, { "epoch": 19.183991683991685, "grad_norm": 0.3759427070617676, "learning_rate": 2.534380608317066e-07, "loss": 0.0703, "num_input_tokens_seen": 7036128, "step": 36910 }, { "epoch": 19.186590436590436, "grad_norm": 1.6223303079605103, "learning_rate": 2.5183010996562007e-07, "loss": 0.1035, "num_input_tokens_seen": 7037088, "step": 36915 }, { "epoch": 19.18918918918919, "grad_norm": 0.1715640276670456, "learning_rate": 2.502272503878328e-07, "loss": 0.1722, "num_input_tokens_seen": 7037984, "step": 36920 }, { "epoch": 19.191787941787943, "grad_norm": 0.14006170630455017, "learning_rate": 2.4862948242808383e-07, "loss": 0.1954, "num_input_tokens_seen": 7038944, "step": 36925 }, { "epoch": 19.194386694386694, "grad_norm": 0.33696258068084717, "learning_rate": 2.4703680641507407e-07, "loss": 0.2661, "num_input_tokens_seen": 7039872, "step": 36930 }, { "epoch": 19.196985446985448, "grad_norm": 1.0268722772598267, "learning_rate": 2.454492226764582e-07, "loss": 0.1487, "num_input_tokens_seen": 7040768, "step": 36935 }, { "epoch": 19.1995841995842, "grad_norm": 1.2060465812683105, "learning_rate": 2.43866731538836e-07, "loss": 0.1226, "num_input_tokens_seen": 7041760, "step": 36940 }, { "epoch": 19.20218295218295, "grad_norm": 1.0459314584732056, "learning_rate": 2.422893333277665e-07, "loss": 0.1808, "num_input_tokens_seen": 7042752, "step": 36945 }, { "epoch": 19.204781704781706, "grad_norm": 0.2872167229652405, "learning_rate": 2.40717028367754e-07, "loss": 0.1086, "num_input_tokens_seen": 7043680, "step": 36950 }, { "epoch": 19.207380457380456, "grad_norm": 0.23723742365837097, "learning_rate": 2.39149816982262e-07, "loss": 0.1171, "num_input_tokens_seen": 7044704, "step": 36955 }, { "epoch": 19.20997920997921, "grad_norm": 0.19532713294029236, "learning_rate": 2.3758769949370762e-07, "loss": 0.2331, "num_input_tokens_seen": 7045600, "step": 36960 }, { "epoch": 19.212577962577964, "grad_norm": 1.0218663215637207, "learning_rate": 2.3603067622344488e-07, "loss": 0.1054, "num_input_tokens_seen": 7046592, "step": 36965 }, { "epoch": 19.215176715176714, "grad_norm": 1.0172232389450073, "learning_rate": 2.3447874749180088e-07, "loss": 0.1332, "num_input_tokens_seen": 7047552, "step": 36970 }, { "epoch": 19.217775467775468, "grad_norm": 0.23045696318149567, "learning_rate": 2.3293191361803967e-07, "loss": 0.1319, "num_input_tokens_seen": 7048480, "step": 36975 }, { "epoch": 19.22037422037422, "grad_norm": 1.4300086498260498, "learning_rate": 2.3139017492038172e-07, "loss": 0.0791, "num_input_tokens_seen": 7049408, "step": 36980 }, { "epoch": 19.222972972972972, "grad_norm": 0.6845518946647644, "learning_rate": 2.2985353171599833e-07, "loss": 0.0711, "num_input_tokens_seen": 7050336, "step": 36985 }, { "epoch": 19.225571725571726, "grad_norm": 0.13268712162971497, "learning_rate": 2.2832198432101714e-07, "loss": 0.1553, "num_input_tokens_seen": 7051360, "step": 36990 }, { "epoch": 19.22817047817048, "grad_norm": 0.936820387840271, "learning_rate": 2.2679553305051116e-07, "loss": 0.1143, "num_input_tokens_seen": 7052288, "step": 36995 }, { "epoch": 19.23076923076923, "grad_norm": 0.08637123554944992, "learning_rate": 2.2527417821850972e-07, "loss": 0.1458, "num_input_tokens_seen": 7053248, "step": 37000 }, { "epoch": 19.233367983367984, "grad_norm": 0.07827615737915039, "learning_rate": 2.2375792013798748e-07, "loss": 0.0819, "num_input_tokens_seen": 7054240, "step": 37005 }, { "epoch": 19.235966735966738, "grad_norm": 0.3200732171535492, "learning_rate": 2.22246759120881e-07, "loss": 0.0846, "num_input_tokens_seen": 7055328, "step": 37010 }, { "epoch": 19.238565488565488, "grad_norm": 0.8006222248077393, "learning_rate": 2.207406954780694e-07, "loss": 0.1237, "num_input_tokens_seen": 7056288, "step": 37015 }, { "epoch": 19.241164241164242, "grad_norm": 0.40678054094314575, "learning_rate": 2.1923972951938543e-07, "loss": 0.1336, "num_input_tokens_seen": 7057280, "step": 37020 }, { "epoch": 19.243762993762992, "grad_norm": 0.5582401752471924, "learning_rate": 2.1774386155361538e-07, "loss": 0.0573, "num_input_tokens_seen": 7058240, "step": 37025 }, { "epoch": 19.246361746361746, "grad_norm": 0.15890063345432281, "learning_rate": 2.1625309188849363e-07, "loss": 0.1071, "num_input_tokens_seen": 7059264, "step": 37030 }, { "epoch": 19.2489604989605, "grad_norm": 0.12325280159711838, "learning_rate": 2.1476742083070546e-07, "loss": 0.0936, "num_input_tokens_seen": 7060224, "step": 37035 }, { "epoch": 19.25155925155925, "grad_norm": 1.2287161350250244, "learning_rate": 2.132868486858869e-07, "loss": 0.1143, "num_input_tokens_seen": 7061152, "step": 37040 }, { "epoch": 19.254158004158004, "grad_norm": 0.721959114074707, "learning_rate": 2.118113757586332e-07, "loss": 0.1498, "num_input_tokens_seen": 7062112, "step": 37045 }, { "epoch": 19.256756756756758, "grad_norm": 0.12339422106742859, "learning_rate": 2.1034100235248212e-07, "loss": 0.0238, "num_input_tokens_seen": 7063040, "step": 37050 }, { "epoch": 19.259355509355508, "grad_norm": 0.33800187706947327, "learning_rate": 2.0887572876992223e-07, "loss": 0.1291, "num_input_tokens_seen": 7063968, "step": 37055 }, { "epoch": 19.261954261954262, "grad_norm": 0.387393057346344, "learning_rate": 2.0741555531239855e-07, "loss": 0.087, "num_input_tokens_seen": 7064992, "step": 37060 }, { "epoch": 19.264553014553016, "grad_norm": 0.5896921157836914, "learning_rate": 2.0596048228030128e-07, "loss": 0.1066, "num_input_tokens_seen": 7065984, "step": 37065 }, { "epoch": 19.267151767151766, "grad_norm": 0.43825769424438477, "learning_rate": 2.0451050997297162e-07, "loss": 0.2216, "num_input_tokens_seen": 7067008, "step": 37070 }, { "epoch": 19.26975051975052, "grad_norm": 0.8193973898887634, "learning_rate": 2.03065638688707e-07, "loss": 0.1244, "num_input_tokens_seen": 7068032, "step": 37075 }, { "epoch": 19.272349272349274, "grad_norm": 1.0983854532241821, "learning_rate": 2.016258687247502e-07, "loss": 0.0852, "num_input_tokens_seen": 7068960, "step": 37080 }, { "epoch": 19.274948024948024, "grad_norm": 0.17212443053722382, "learning_rate": 2.0019120037730043e-07, "loss": 0.0781, "num_input_tokens_seen": 7069856, "step": 37085 }, { "epoch": 19.277546777546778, "grad_norm": 0.1129128709435463, "learning_rate": 1.987616339414966e-07, "loss": 0.1291, "num_input_tokens_seen": 7070816, "step": 37090 }, { "epoch": 19.28014553014553, "grad_norm": 1.6581480503082275, "learning_rate": 1.9733716971143678e-07, "loss": 0.1157, "num_input_tokens_seen": 7071744, "step": 37095 }, { "epoch": 19.282744282744282, "grad_norm": 2.1332571506500244, "learning_rate": 1.9591780798016989e-07, "loss": 0.1562, "num_input_tokens_seen": 7072736, "step": 37100 }, { "epoch": 19.285343035343036, "grad_norm": 0.8415141105651855, "learning_rate": 1.9450354903968737e-07, "loss": 0.0848, "num_input_tokens_seen": 7073600, "step": 37105 }, { "epoch": 19.287941787941786, "grad_norm": 0.43894389271736145, "learning_rate": 1.930943931809398e-07, "loss": 0.1953, "num_input_tokens_seen": 7074528, "step": 37110 }, { "epoch": 19.29054054054054, "grad_norm": 0.8547520637512207, "learning_rate": 1.9169034069382584e-07, "loss": 0.0734, "num_input_tokens_seen": 7075488, "step": 37115 }, { "epoch": 19.293139293139294, "grad_norm": 1.303713083267212, "learning_rate": 1.9029139186718948e-07, "loss": 0.1451, "num_input_tokens_seen": 7076448, "step": 37120 }, { "epoch": 19.295738045738045, "grad_norm": 0.301475465297699, "learning_rate": 1.8889754698882822e-07, "loss": 0.187, "num_input_tokens_seen": 7077344, "step": 37125 }, { "epoch": 19.2983367983368, "grad_norm": 0.7117434144020081, "learning_rate": 1.8750880634548772e-07, "loss": 0.1052, "num_input_tokens_seen": 7078272, "step": 37130 }, { "epoch": 19.300935550935552, "grad_norm": 0.08972958475351334, "learning_rate": 1.8612517022286725e-07, "loss": 0.074, "num_input_tokens_seen": 7079168, "step": 37135 }, { "epoch": 19.303534303534303, "grad_norm": 0.24843961000442505, "learning_rate": 1.8474663890561405e-07, "loss": 0.1099, "num_input_tokens_seen": 7080128, "step": 37140 }, { "epoch": 19.306133056133056, "grad_norm": 1.7534433603286743, "learning_rate": 1.8337321267732356e-07, "loss": 0.2409, "num_input_tokens_seen": 7081120, "step": 37145 }, { "epoch": 19.30873180873181, "grad_norm": 0.3007986843585968, "learning_rate": 1.8200489182054193e-07, "loss": 0.2103, "num_input_tokens_seen": 7082080, "step": 37150 }, { "epoch": 19.31133056133056, "grad_norm": 0.15292133390903473, "learning_rate": 1.80641676616769e-07, "loss": 0.059, "num_input_tokens_seen": 7083104, "step": 37155 }, { "epoch": 19.313929313929314, "grad_norm": 1.2464463710784912, "learning_rate": 1.7928356734644436e-07, "loss": 0.0991, "num_input_tokens_seen": 7084096, "step": 37160 }, { "epoch": 19.316528066528065, "grad_norm": 0.12305931001901627, "learning_rate": 1.779305642889667e-07, "loss": 0.1022, "num_input_tokens_seen": 7085120, "step": 37165 }, { "epoch": 19.31912681912682, "grad_norm": 0.4875238835811615, "learning_rate": 1.7658266772268284e-07, "loss": 0.1285, "num_input_tokens_seen": 7086112, "step": 37170 }, { "epoch": 19.321725571725572, "grad_norm": 2.541036605834961, "learning_rate": 1.7523987792488483e-07, "loss": 0.1901, "num_input_tokens_seen": 7087040, "step": 37175 }, { "epoch": 19.324324324324323, "grad_norm": 1.0010212659835815, "learning_rate": 1.739021951718184e-07, "loss": 0.1788, "num_input_tokens_seen": 7088064, "step": 37180 }, { "epoch": 19.326923076923077, "grad_norm": 0.2534255385398865, "learning_rate": 1.7256961973867447e-07, "loss": 0.2472, "num_input_tokens_seen": 7088960, "step": 37185 }, { "epoch": 19.32952182952183, "grad_norm": 1.0465352535247803, "learning_rate": 1.7124215189959768e-07, "loss": 0.1178, "num_input_tokens_seen": 7089888, "step": 37190 }, { "epoch": 19.33212058212058, "grad_norm": 0.2354370504617691, "learning_rate": 1.6991979192767793e-07, "loss": 0.0835, "num_input_tokens_seen": 7090880, "step": 37195 }, { "epoch": 19.334719334719335, "grad_norm": 0.43779370188713074, "learning_rate": 1.686025400949559e-07, "loss": 0.1826, "num_input_tokens_seen": 7091872, "step": 37200 }, { "epoch": 19.33731808731809, "grad_norm": 0.33300286531448364, "learning_rate": 1.6729039667242596e-07, "loss": 0.0383, "num_input_tokens_seen": 7092768, "step": 37205 }, { "epoch": 19.33991683991684, "grad_norm": 1.4834423065185547, "learning_rate": 1.6598336193002216e-07, "loss": 0.0807, "num_input_tokens_seen": 7093728, "step": 37210 }, { "epoch": 19.342515592515593, "grad_norm": 1.1275255680084229, "learning_rate": 1.64681436136635e-07, "loss": 0.1097, "num_input_tokens_seen": 7094656, "step": 37215 }, { "epoch": 19.345114345114347, "grad_norm": 0.4782935082912445, "learning_rate": 1.6338461956010022e-07, "loss": 0.1048, "num_input_tokens_seen": 7095616, "step": 37220 }, { "epoch": 19.347713097713097, "grad_norm": 0.8568390607833862, "learning_rate": 1.6209291246720714e-07, "loss": 0.0584, "num_input_tokens_seen": 7096512, "step": 37225 }, { "epoch": 19.35031185031185, "grad_norm": 0.9684969782829285, "learning_rate": 1.6080631512368772e-07, "loss": 0.1753, "num_input_tokens_seen": 7097440, "step": 37230 }, { "epoch": 19.352910602910605, "grad_norm": 1.4478659629821777, "learning_rate": 1.5952482779422463e-07, "loss": 0.0747, "num_input_tokens_seen": 7098464, "step": 37235 }, { "epoch": 19.355509355509355, "grad_norm": 2.3251190185546875, "learning_rate": 1.582484507424542e-07, "loss": 0.1681, "num_input_tokens_seen": 7099456, "step": 37240 }, { "epoch": 19.35810810810811, "grad_norm": 1.2969690561294556, "learning_rate": 1.569771842309553e-07, "loss": 0.1771, "num_input_tokens_seen": 7100384, "step": 37245 }, { "epoch": 19.36070686070686, "grad_norm": 0.24907122552394867, "learning_rate": 1.5571102852125762e-07, "loss": 0.1031, "num_input_tokens_seen": 7101408, "step": 37250 }, { "epoch": 19.363305613305613, "grad_norm": 0.11288633942604065, "learning_rate": 1.5444998387383892e-07, "loss": 0.1251, "num_input_tokens_seen": 7102336, "step": 37255 }, { "epoch": 19.365904365904367, "grad_norm": 1.1853588819503784, "learning_rate": 1.5319405054813052e-07, "loss": 0.1147, "num_input_tokens_seen": 7103360, "step": 37260 }, { "epoch": 19.368503118503117, "grad_norm": 0.302783340215683, "learning_rate": 1.5194322880250356e-07, "loss": 0.0945, "num_input_tokens_seen": 7104224, "step": 37265 }, { "epoch": 19.37110187110187, "grad_norm": 0.8277984261512756, "learning_rate": 1.5069751889428274e-07, "loss": 0.139, "num_input_tokens_seen": 7105152, "step": 37270 }, { "epoch": 19.373700623700625, "grad_norm": 0.0988534688949585, "learning_rate": 1.4945692107973808e-07, "loss": 0.0815, "num_input_tokens_seen": 7106208, "step": 37275 }, { "epoch": 19.376299376299375, "grad_norm": 1.0884679555892944, "learning_rate": 1.4822143561409317e-07, "loss": 0.124, "num_input_tokens_seen": 7107136, "step": 37280 }, { "epoch": 19.37889812889813, "grad_norm": 1.401443600654602, "learning_rate": 1.4699106275151975e-07, "loss": 0.1934, "num_input_tokens_seen": 7108128, "step": 37285 }, { "epoch": 19.381496881496883, "grad_norm": 0.6504327058792114, "learning_rate": 1.4576580274512642e-07, "loss": 0.2367, "num_input_tokens_seen": 7109088, "step": 37290 }, { "epoch": 19.384095634095633, "grad_norm": 0.6547403335571289, "learning_rate": 1.4454565584698654e-07, "loss": 0.0762, "num_input_tokens_seen": 7109984, "step": 37295 }, { "epoch": 19.386694386694387, "grad_norm": 0.7305966019630432, "learning_rate": 1.4333062230810769e-07, "loss": 0.1094, "num_input_tokens_seen": 7110912, "step": 37300 }, { "epoch": 19.38929313929314, "grad_norm": 0.28541067242622375, "learning_rate": 1.4212070237845378e-07, "loss": 0.1125, "num_input_tokens_seen": 7111872, "step": 37305 }, { "epoch": 19.39189189189189, "grad_norm": 0.1740744262933731, "learning_rate": 1.409158963069368e-07, "loss": 0.0794, "num_input_tokens_seen": 7112832, "step": 37310 }, { "epoch": 19.394490644490645, "grad_norm": 0.2782192826271057, "learning_rate": 1.3971620434140852e-07, "loss": 0.1149, "num_input_tokens_seen": 7113728, "step": 37315 }, { "epoch": 19.397089397089395, "grad_norm": 0.3747123181819916, "learning_rate": 1.3852162672867708e-07, "loss": 0.0851, "num_input_tokens_seen": 7114656, "step": 37320 }, { "epoch": 19.39968814968815, "grad_norm": 0.2754388153553009, "learning_rate": 1.3733216371449586e-07, "loss": 0.113, "num_input_tokens_seen": 7115616, "step": 37325 }, { "epoch": 19.402286902286903, "grad_norm": 0.6667495369911194, "learning_rate": 1.361478155435636e-07, "loss": 0.2061, "num_input_tokens_seen": 7116640, "step": 37330 }, { "epoch": 19.404885654885653, "grad_norm": 0.8725494146347046, "learning_rate": 1.349685824595326e-07, "loss": 0.0904, "num_input_tokens_seen": 7117600, "step": 37335 }, { "epoch": 19.407484407484407, "grad_norm": 1.4536422491073608, "learning_rate": 1.3379446470499767e-07, "loss": 0.1449, "num_input_tokens_seen": 7118624, "step": 37340 }, { "epoch": 19.41008316008316, "grad_norm": 0.16091130673885345, "learning_rate": 1.32625462521499e-07, "loss": 0.1969, "num_input_tokens_seen": 7119520, "step": 37345 }, { "epoch": 19.41268191268191, "grad_norm": 0.7183749079704285, "learning_rate": 1.314615761495358e-07, "loss": 0.1411, "num_input_tokens_seen": 7120480, "step": 37350 }, { "epoch": 19.415280665280665, "grad_norm": 0.5190526247024536, "learning_rate": 1.303028058285388e-07, "loss": 0.188, "num_input_tokens_seen": 7121440, "step": 37355 }, { "epoch": 19.41787941787942, "grad_norm": 0.08736097067594528, "learning_rate": 1.2914915179690067e-07, "loss": 0.1467, "num_input_tokens_seen": 7122336, "step": 37360 }, { "epoch": 19.42047817047817, "grad_norm": 1.5236475467681885, "learning_rate": 1.2800061429195377e-07, "loss": 0.3333, "num_input_tokens_seen": 7123264, "step": 37365 }, { "epoch": 19.423076923076923, "grad_norm": 0.6066229939460754, "learning_rate": 1.268571935499785e-07, "loss": 0.0825, "num_input_tokens_seen": 7124224, "step": 37370 }, { "epoch": 19.425675675675677, "grad_norm": 1.0799739360809326, "learning_rate": 1.2571888980620615e-07, "loss": 0.1454, "num_input_tokens_seen": 7125120, "step": 37375 }, { "epoch": 19.428274428274428, "grad_norm": 0.9868587851524353, "learning_rate": 1.2458570329481057e-07, "loss": 0.1334, "num_input_tokens_seen": 7126048, "step": 37380 }, { "epoch": 19.43087318087318, "grad_norm": 2.0660412311553955, "learning_rate": 1.2345763424891632e-07, "loss": 0.2864, "num_input_tokens_seen": 7127040, "step": 37385 }, { "epoch": 19.433471933471935, "grad_norm": 2.437065362930298, "learning_rate": 1.2233468290059614e-07, "loss": 0.1467, "num_input_tokens_seen": 7128064, "step": 37390 }, { "epoch": 19.436070686070686, "grad_norm": 0.2623727023601532, "learning_rate": 1.2121684948086243e-07, "loss": 0.0583, "num_input_tokens_seen": 7129024, "step": 37395 }, { "epoch": 19.43866943866944, "grad_norm": 0.27023667097091675, "learning_rate": 1.2010413421968404e-07, "loss": 0.1799, "num_input_tokens_seen": 7129952, "step": 37400 }, { "epoch": 19.44126819126819, "grad_norm": 1.5628875494003296, "learning_rate": 1.1899653734597505e-07, "loss": 0.1727, "num_input_tokens_seen": 7130880, "step": 37405 }, { "epoch": 19.443866943866944, "grad_norm": 0.14102822542190552, "learning_rate": 1.1789405908758932e-07, "loss": 0.0662, "num_input_tokens_seen": 7131744, "step": 37410 }, { "epoch": 19.446465696465697, "grad_norm": 0.677739143371582, "learning_rate": 1.1679669967133434e-07, "loss": 0.0779, "num_input_tokens_seen": 7132736, "step": 37415 }, { "epoch": 19.449064449064448, "grad_norm": 0.7397706508636475, "learning_rate": 1.1570445932296837e-07, "loss": 0.0444, "num_input_tokens_seen": 7133600, "step": 37420 }, { "epoch": 19.4516632016632, "grad_norm": 0.8937792181968689, "learning_rate": 1.146173382671839e-07, "loss": 0.2544, "num_input_tokens_seen": 7134528, "step": 37425 }, { "epoch": 19.454261954261955, "grad_norm": 1.0055338144302368, "learning_rate": 1.1353533672763261e-07, "loss": 0.0716, "num_input_tokens_seen": 7135520, "step": 37430 }, { "epoch": 19.456860706860706, "grad_norm": 0.6144015789031982, "learning_rate": 1.124584549269031e-07, "loss": 0.1168, "num_input_tokens_seen": 7136544, "step": 37435 }, { "epoch": 19.45945945945946, "grad_norm": 0.11173281073570251, "learning_rate": 1.1138669308654314e-07, "loss": 0.103, "num_input_tokens_seen": 7137376, "step": 37440 }, { "epoch": 19.462058212058214, "grad_norm": 0.4522971510887146, "learning_rate": 1.1032005142703195e-07, "loss": 0.0263, "num_input_tokens_seen": 7138208, "step": 37445 }, { "epoch": 19.464656964656964, "grad_norm": 0.46762815117836, "learning_rate": 1.0925853016780784e-07, "loss": 0.2115, "num_input_tokens_seen": 7139232, "step": 37450 }, { "epoch": 19.467255717255718, "grad_norm": 0.49050506949424744, "learning_rate": 1.0820212952724896e-07, "loss": 0.1168, "num_input_tokens_seen": 7140192, "step": 37455 }, { "epoch": 19.46985446985447, "grad_norm": 0.2872164249420166, "learning_rate": 1.0715084972268418e-07, "loss": 0.1024, "num_input_tokens_seen": 7141152, "step": 37460 }, { "epoch": 19.472453222453222, "grad_norm": 0.10281388461589813, "learning_rate": 1.06104690970385e-07, "loss": 0.1656, "num_input_tokens_seen": 7142208, "step": 37465 }, { "epoch": 19.475051975051976, "grad_norm": 0.3023972809314728, "learning_rate": 1.0506365348557091e-07, "loss": 0.1165, "num_input_tokens_seen": 7143136, "step": 37470 }, { "epoch": 19.477650727650726, "grad_norm": 0.8498643636703491, "learning_rate": 1.0402773748240947e-07, "loss": 0.1917, "num_input_tokens_seen": 7144128, "step": 37475 }, { "epoch": 19.48024948024948, "grad_norm": 0.6830626130104065, "learning_rate": 1.0299694317401353e-07, "loss": 0.0816, "num_input_tokens_seen": 7145120, "step": 37480 }, { "epoch": 19.482848232848234, "grad_norm": 0.1110248938202858, "learning_rate": 1.0197127077244129e-07, "loss": 0.0851, "num_input_tokens_seen": 7146048, "step": 37485 }, { "epoch": 19.485446985446984, "grad_norm": 0.5572691559791565, "learning_rate": 1.0095072048869614e-07, "loss": 0.1408, "num_input_tokens_seen": 7146944, "step": 37490 }, { "epoch": 19.488045738045738, "grad_norm": 0.8475772142410278, "learning_rate": 9.993529253273237e-08, "loss": 0.116, "num_input_tokens_seen": 7147904, "step": 37495 }, { "epoch": 19.490644490644492, "grad_norm": 1.2221381664276123, "learning_rate": 9.892498711344678e-08, "loss": 0.1881, "num_input_tokens_seen": 7148960, "step": 37500 }, { "epoch": 19.493243243243242, "grad_norm": 0.04724970832467079, "learning_rate": 9.791980443868421e-08, "loss": 0.099, "num_input_tokens_seen": 7149920, "step": 37505 }, { "epoch": 19.495841995841996, "grad_norm": 0.909550666809082, "learning_rate": 9.691974471523202e-08, "loss": 0.0691, "num_input_tokens_seen": 7150880, "step": 37510 }, { "epoch": 19.49844074844075, "grad_norm": 0.5152624249458313, "learning_rate": 9.592480814882843e-08, "loss": 0.1603, "num_input_tokens_seen": 7151744, "step": 37515 }, { "epoch": 19.5010395010395, "grad_norm": 0.5798041224479675, "learning_rate": 9.493499494415414e-08, "loss": 0.1808, "num_input_tokens_seen": 7152768, "step": 37520 }, { "epoch": 19.503638253638254, "grad_norm": 0.21259775757789612, "learning_rate": 9.395030530484072e-08, "loss": 0.1729, "num_input_tokens_seen": 7153856, "step": 37525 }, { "epoch": 19.506237006237008, "grad_norm": 0.2743983864784241, "learning_rate": 9.297073943345946e-08, "loss": 0.091, "num_input_tokens_seen": 7154784, "step": 37530 }, { "epoch": 19.508835758835758, "grad_norm": 0.34014034271240234, "learning_rate": 9.199629753152972e-08, "loss": 0.158, "num_input_tokens_seen": 7155840, "step": 37535 }, { "epoch": 19.511434511434512, "grad_norm": 1.4918479919433594, "learning_rate": 9.102697979951613e-08, "loss": 0.226, "num_input_tokens_seen": 7156864, "step": 37540 }, { "epoch": 19.514033264033262, "grad_norm": 1.079309105873108, "learning_rate": 9.006278643683696e-08, "loss": 0.132, "num_input_tokens_seen": 7157824, "step": 37545 }, { "epoch": 19.516632016632016, "grad_norm": 0.890350878238678, "learning_rate": 8.910371764184466e-08, "loss": 0.1228, "num_input_tokens_seen": 7158752, "step": 37550 }, { "epoch": 19.51923076923077, "grad_norm": 0.13012953102588654, "learning_rate": 8.814977361184251e-08, "loss": 0.1842, "num_input_tokens_seen": 7159680, "step": 37555 }, { "epoch": 19.52182952182952, "grad_norm": 0.8926715850830078, "learning_rate": 8.720095454308186e-08, "loss": 0.1641, "num_input_tokens_seen": 7160544, "step": 37560 }, { "epoch": 19.524428274428274, "grad_norm": 0.20830278098583221, "learning_rate": 8.625726063075656e-08, "loss": 0.1159, "num_input_tokens_seen": 7161472, "step": 37565 }, { "epoch": 19.527027027027028, "grad_norm": 0.08510120213031769, "learning_rate": 8.531869206900578e-08, "loss": 0.116, "num_input_tokens_seen": 7162400, "step": 37570 }, { "epoch": 19.52962577962578, "grad_norm": 1.5521626472473145, "learning_rate": 8.438524905091393e-08, "loss": 0.1517, "num_input_tokens_seen": 7163360, "step": 37575 }, { "epoch": 19.532224532224532, "grad_norm": 1.422926664352417, "learning_rate": 8.34569317685191e-08, "loss": 0.254, "num_input_tokens_seen": 7164256, "step": 37580 }, { "epoch": 19.534823284823286, "grad_norm": 1.240492820739746, "learning_rate": 8.253374041279072e-08, "loss": 0.2045, "num_input_tokens_seen": 7165184, "step": 37585 }, { "epoch": 19.537422037422036, "grad_norm": 1.1931846141815186, "learning_rate": 8.161567517365465e-08, "loss": 0.2791, "num_input_tokens_seen": 7166176, "step": 37590 }, { "epoch": 19.54002079002079, "grad_norm": 0.09208046644926071, "learning_rate": 8.070273623997926e-08, "loss": 0.0602, "num_input_tokens_seen": 7167136, "step": 37595 }, { "epoch": 19.542619542619544, "grad_norm": 1.2290359735488892, "learning_rate": 7.979492379957543e-08, "loss": 0.1672, "num_input_tokens_seen": 7168032, "step": 37600 }, { "epoch": 19.545218295218294, "grad_norm": 0.6254697442054749, "learning_rate": 7.889223803920764e-08, "loss": 0.1591, "num_input_tokens_seen": 7168960, "step": 37605 }, { "epoch": 19.54781704781705, "grad_norm": 1.053346037864685, "learning_rate": 7.799467914457181e-08, "loss": 0.1352, "num_input_tokens_seen": 7169824, "step": 37610 }, { "epoch": 19.5504158004158, "grad_norm": 0.6155233383178711, "learning_rate": 7.7102247300323e-08, "loss": 0.0904, "num_input_tokens_seen": 7170784, "step": 37615 }, { "epoch": 19.553014553014552, "grad_norm": 0.34678617119789124, "learning_rate": 7.621494269005324e-08, "loss": 0.1026, "num_input_tokens_seen": 7171808, "step": 37620 }, { "epoch": 19.555613305613306, "grad_norm": 0.8398536443710327, "learning_rate": 7.533276549630264e-08, "loss": 0.0925, "num_input_tokens_seen": 7172768, "step": 37625 }, { "epoch": 19.558212058212057, "grad_norm": 0.7224915027618408, "learning_rate": 7.445571590055655e-08, "loss": 0.145, "num_input_tokens_seen": 7173696, "step": 37630 }, { "epoch": 19.56081081081081, "grad_norm": 1.4969333410263062, "learning_rate": 7.358379408324289e-08, "loss": 0.1111, "num_input_tokens_seen": 7174688, "step": 37635 }, { "epoch": 19.563409563409564, "grad_norm": 1.3265891075134277, "learning_rate": 7.271700022374039e-08, "loss": 0.1121, "num_input_tokens_seen": 7175680, "step": 37640 }, { "epoch": 19.566008316008315, "grad_norm": 0.4241807758808136, "learning_rate": 7.185533450036752e-08, "loss": 0.0447, "num_input_tokens_seen": 7176576, "step": 37645 }, { "epoch": 19.56860706860707, "grad_norm": 1.039385199546814, "learning_rate": 7.099879709038804e-08, "loss": 0.1834, "num_input_tokens_seen": 7177504, "step": 37650 }, { "epoch": 19.571205821205822, "grad_norm": 0.15194936096668243, "learning_rate": 7.014738817001654e-08, "loss": 0.1028, "num_input_tokens_seen": 7178496, "step": 37655 }, { "epoch": 19.573804573804573, "grad_norm": 1.4306913614273071, "learning_rate": 6.93011079144018e-08, "loss": 0.1187, "num_input_tokens_seen": 7179424, "step": 37660 }, { "epoch": 19.576403326403327, "grad_norm": 0.1323123276233673, "learning_rate": 6.845995649764625e-08, "loss": 0.1855, "num_input_tokens_seen": 7180448, "step": 37665 }, { "epoch": 19.57900207900208, "grad_norm": 0.26810887455940247, "learning_rate": 6.762393409279755e-08, "loss": 0.0782, "num_input_tokens_seen": 7181376, "step": 37670 }, { "epoch": 19.58160083160083, "grad_norm": 0.9735679626464844, "learning_rate": 6.679304087184313e-08, "loss": 0.0625, "num_input_tokens_seen": 7182304, "step": 37675 }, { "epoch": 19.584199584199585, "grad_norm": 1.0860530138015747, "learning_rate": 6.596727700571847e-08, "loss": 0.0705, "num_input_tokens_seen": 7183328, "step": 37680 }, { "epoch": 19.58679833679834, "grad_norm": 0.684960126876831, "learning_rate": 6.51466426643016e-08, "loss": 0.2967, "num_input_tokens_seen": 7184224, "step": 37685 }, { "epoch": 19.58939708939709, "grad_norm": 0.4492851197719574, "learning_rate": 6.433113801641577e-08, "loss": 0.0542, "num_input_tokens_seen": 7185120, "step": 37690 }, { "epoch": 19.591995841995843, "grad_norm": 1.024964451789856, "learning_rate": 6.352076322983236e-08, "loss": 0.1538, "num_input_tokens_seen": 7186176, "step": 37695 }, { "epoch": 19.594594594594593, "grad_norm": 0.10284353792667389, "learning_rate": 6.271551847126245e-08, "loss": 0.0815, "num_input_tokens_seen": 7187200, "step": 37700 }, { "epoch": 19.597193347193347, "grad_norm": 0.31787899136543274, "learning_rate": 6.191540390636796e-08, "loss": 0.1119, "num_input_tokens_seen": 7188128, "step": 37705 }, { "epoch": 19.5997920997921, "grad_norm": 1.2086910009384155, "learning_rate": 6.112041969974503e-08, "loss": 0.1103, "num_input_tokens_seen": 7189056, "step": 37710 }, { "epoch": 19.60239085239085, "grad_norm": 0.6140233278274536, "learning_rate": 6.033056601494613e-08, "loss": 0.2608, "num_input_tokens_seen": 7189984, "step": 37715 }, { "epoch": 19.604989604989605, "grad_norm": 1.1896299123764038, "learning_rate": 5.9545843014460754e-08, "loss": 0.1168, "num_input_tokens_seen": 7190976, "step": 37720 }, { "epoch": 19.60758835758836, "grad_norm": 0.15275533497333527, "learning_rate": 5.876625085972642e-08, "loss": 0.2052, "num_input_tokens_seen": 7191904, "step": 37725 }, { "epoch": 19.61018711018711, "grad_norm": 0.13135772943496704, "learning_rate": 5.799178971112318e-08, "loss": 0.112, "num_input_tokens_seen": 7192800, "step": 37730 }, { "epoch": 19.612785862785863, "grad_norm": 1.651266098022461, "learning_rate": 5.722245972797635e-08, "loss": 0.2336, "num_input_tokens_seen": 7193696, "step": 37735 }, { "epoch": 19.615384615384617, "grad_norm": 0.910911500453949, "learning_rate": 5.6458261068553787e-08, "loss": 0.1431, "num_input_tokens_seen": 7194656, "step": 37740 }, { "epoch": 19.617983367983367, "grad_norm": 0.14558640122413635, "learning_rate": 5.5699193890074165e-08, "loss": 0.0485, "num_input_tokens_seen": 7195584, "step": 37745 }, { "epoch": 19.62058212058212, "grad_norm": 0.16875918209552765, "learning_rate": 5.4945258348690354e-08, "loss": 0.0551, "num_input_tokens_seen": 7196512, "step": 37750 }, { "epoch": 19.623180873180875, "grad_norm": 1.6547532081604004, "learning_rate": 5.419645459950606e-08, "loss": 0.1734, "num_input_tokens_seen": 7197440, "step": 37755 }, { "epoch": 19.625779625779625, "grad_norm": 0.8806331753730774, "learning_rate": 5.345278279656751e-08, "loss": 0.0855, "num_input_tokens_seen": 7198400, "step": 37760 }, { "epoch": 19.62837837837838, "grad_norm": 1.7142409086227417, "learning_rate": 5.271424309286899e-08, "loss": 0.1552, "num_input_tokens_seen": 7199424, "step": 37765 }, { "epoch": 19.63097713097713, "grad_norm": 0.6169254183769226, "learning_rate": 5.1980835640344505e-08, "loss": 0.1076, "num_input_tokens_seen": 7200384, "step": 37770 }, { "epoch": 19.633575883575883, "grad_norm": 0.9519529938697815, "learning_rate": 5.12525605898706e-08, "loss": 0.1272, "num_input_tokens_seen": 7201440, "step": 37775 }, { "epoch": 19.636174636174637, "grad_norm": 0.5576204657554626, "learning_rate": 5.052941809127465e-08, "loss": 0.1969, "num_input_tokens_seen": 7202432, "step": 37780 }, { "epoch": 19.638773388773387, "grad_norm": 1.1351816654205322, "learning_rate": 4.9811408293320984e-08, "loss": 0.0665, "num_input_tokens_seen": 7203392, "step": 37785 }, { "epoch": 19.64137214137214, "grad_norm": 0.6200825572013855, "learning_rate": 4.9098531343724773e-08, "loss": 0.0928, "num_input_tokens_seen": 7204352, "step": 37790 }, { "epoch": 19.643970893970895, "grad_norm": 0.6325084567070007, "learning_rate": 4.839078738913816e-08, "loss": 0.0956, "num_input_tokens_seen": 7205280, "step": 37795 }, { "epoch": 19.646569646569645, "grad_norm": 1.3153988122940063, "learning_rate": 4.768817657516411e-08, "loss": 0.1632, "num_input_tokens_seen": 7206240, "step": 37800 }, { "epoch": 19.6491683991684, "grad_norm": 0.13747796416282654, "learning_rate": 4.699069904634257e-08, "loss": 0.1256, "num_input_tokens_seen": 7207200, "step": 37805 }, { "epoch": 19.651767151767153, "grad_norm": 0.18639791011810303, "learning_rate": 4.629835494616708e-08, "loss": 0.1054, "num_input_tokens_seen": 7208192, "step": 37810 }, { "epoch": 19.654365904365903, "grad_norm": 0.21109308302402496, "learning_rate": 4.56111444170626e-08, "loss": 0.0788, "num_input_tokens_seen": 7209152, "step": 37815 }, { "epoch": 19.656964656964657, "grad_norm": 0.600543200969696, "learning_rate": 4.492906760041049e-08, "loss": 0.1755, "num_input_tokens_seen": 7210080, "step": 37820 }, { "epoch": 19.65956340956341, "grad_norm": 0.8697383999824524, "learning_rate": 4.425212463652628e-08, "loss": 0.1456, "num_input_tokens_seen": 7211072, "step": 37825 }, { "epoch": 19.66216216216216, "grad_norm": 0.7122933268547058, "learning_rate": 4.358031566467635e-08, "loss": 0.0778, "num_input_tokens_seen": 7212096, "step": 37830 }, { "epoch": 19.664760914760915, "grad_norm": 0.23437930643558502, "learning_rate": 4.291364082306404e-08, "loss": 0.1097, "num_input_tokens_seen": 7213088, "step": 37835 }, { "epoch": 19.66735966735967, "grad_norm": 0.12281487882137299, "learning_rate": 4.2252100248843515e-08, "loss": 0.0985, "num_input_tokens_seen": 7214016, "step": 37840 }, { "epoch": 19.66995841995842, "grad_norm": 0.5413916110992432, "learning_rate": 4.1595694078108706e-08, "loss": 0.1575, "num_input_tokens_seen": 7214880, "step": 37845 }, { "epoch": 19.672557172557173, "grad_norm": 0.6080290675163269, "learning_rate": 4.09444224458988e-08, "loss": 0.0932, "num_input_tokens_seen": 7215840, "step": 37850 }, { "epoch": 19.675155925155924, "grad_norm": 0.9618713855743408, "learning_rate": 4.0298285486192746e-08, "loss": 0.1637, "num_input_tokens_seen": 7216736, "step": 37855 }, { "epoch": 19.677754677754677, "grad_norm": 0.4216115474700928, "learning_rate": 3.965728333192031e-08, "loss": 0.1308, "num_input_tokens_seen": 7217664, "step": 37860 }, { "epoch": 19.68035343035343, "grad_norm": 0.10120455175638199, "learning_rate": 3.902141611494548e-08, "loss": 0.0599, "num_input_tokens_seen": 7218528, "step": 37865 }, { "epoch": 19.68295218295218, "grad_norm": 0.3839718997478485, "learning_rate": 3.839068396608858e-08, "loss": 0.1197, "num_input_tokens_seen": 7219520, "step": 37870 }, { "epoch": 19.685550935550935, "grad_norm": 0.9934536814689636, "learning_rate": 3.776508701509862e-08, "loss": 0.2563, "num_input_tokens_seen": 7220480, "step": 37875 }, { "epoch": 19.68814968814969, "grad_norm": 0.6014882326126099, "learning_rate": 3.714462539068375e-08, "loss": 0.0439, "num_input_tokens_seen": 7221408, "step": 37880 }, { "epoch": 19.69074844074844, "grad_norm": 0.8269248604774475, "learning_rate": 3.652929922048076e-08, "loss": 0.1911, "num_input_tokens_seen": 7222400, "step": 37885 }, { "epoch": 19.693347193347194, "grad_norm": 0.3394087851047516, "learning_rate": 3.591910863108006e-08, "loss": 0.073, "num_input_tokens_seen": 7223296, "step": 37890 }, { "epoch": 19.695945945945947, "grad_norm": 1.1414047479629517, "learning_rate": 3.531405374801178e-08, "loss": 0.2109, "num_input_tokens_seen": 7224192, "step": 37895 }, { "epoch": 19.698544698544698, "grad_norm": 0.34701767563819885, "learning_rate": 3.471413469575135e-08, "loss": 0.1157, "num_input_tokens_seen": 7225152, "step": 37900 }, { "epoch": 19.70114345114345, "grad_norm": 0.5003429651260376, "learning_rate": 3.411935159771395e-08, "loss": 0.0884, "num_input_tokens_seen": 7226144, "step": 37905 }, { "epoch": 19.703742203742205, "grad_norm": 1.1096858978271484, "learning_rate": 3.352970457626281e-08, "loss": 0.2628, "num_input_tokens_seen": 7227168, "step": 37910 }, { "epoch": 19.706340956340956, "grad_norm": 0.8107615113258362, "learning_rate": 3.2945193752700887e-08, "loss": 0.1184, "num_input_tokens_seen": 7228128, "step": 37915 }, { "epoch": 19.70893970893971, "grad_norm": 0.39758455753326416, "learning_rate": 3.236581924727922e-08, "loss": 0.1395, "num_input_tokens_seen": 7229088, "step": 37920 }, { "epoch": 19.71153846153846, "grad_norm": 0.4545205533504486, "learning_rate": 3.179158117918579e-08, "loss": 0.096, "num_input_tokens_seen": 7229984, "step": 37925 }, { "epoch": 19.714137214137214, "grad_norm": 0.7922695875167847, "learning_rate": 3.1222479666553894e-08, "loss": 0.0732, "num_input_tokens_seen": 7230944, "step": 37930 }, { "epoch": 19.716735966735968, "grad_norm": 1.1264615058898926, "learning_rate": 3.065851482646487e-08, "loss": 0.2012, "num_input_tokens_seen": 7231840, "step": 37935 }, { "epoch": 19.719334719334718, "grad_norm": 0.1968788504600525, "learning_rate": 3.009968677493702e-08, "loss": 0.0818, "num_input_tokens_seen": 7232800, "step": 37940 }, { "epoch": 19.721933471933472, "grad_norm": 0.5960128903388977, "learning_rate": 2.9545995626936717e-08, "loss": 0.0957, "num_input_tokens_seen": 7233728, "step": 37945 }, { "epoch": 19.724532224532226, "grad_norm": 0.12881064414978027, "learning_rate": 2.8997441496370072e-08, "loss": 0.0829, "num_input_tokens_seen": 7234624, "step": 37950 }, { "epoch": 19.727130977130976, "grad_norm": 0.7801172733306885, "learning_rate": 2.8454024496085696e-08, "loss": 0.0894, "num_input_tokens_seen": 7235584, "step": 37955 }, { "epoch": 19.72972972972973, "grad_norm": 0.7538594603538513, "learning_rate": 2.7915744737883055e-08, "loss": 0.068, "num_input_tokens_seen": 7236480, "step": 37960 }, { "epoch": 19.732328482328484, "grad_norm": 0.9501662850379944, "learning_rate": 2.7382602332493013e-08, "loss": 0.1573, "num_input_tokens_seen": 7237376, "step": 37965 }, { "epoch": 19.734927234927234, "grad_norm": 0.1585043966770172, "learning_rate": 2.6854597389600055e-08, "loss": 0.0566, "num_input_tokens_seen": 7238368, "step": 37970 }, { "epoch": 19.737525987525988, "grad_norm": 0.4572448134422302, "learning_rate": 2.633173001782563e-08, "loss": 0.0858, "num_input_tokens_seen": 7239328, "step": 37975 }, { "epoch": 19.74012474012474, "grad_norm": 0.050369855016469955, "learning_rate": 2.581400032473369e-08, "loss": 0.0606, "num_input_tokens_seen": 7240288, "step": 37980 }, { "epoch": 19.742723492723492, "grad_norm": 1.013663411140442, "learning_rate": 2.5301408416839035e-08, "loss": 0.2136, "num_input_tokens_seen": 7241216, "step": 37985 }, { "epoch": 19.745322245322246, "grad_norm": 0.28073355555534363, "learning_rate": 2.4793954399590647e-08, "loss": 0.1287, "num_input_tokens_seen": 7242176, "step": 37990 }, { "epoch": 19.747920997921, "grad_norm": 0.619683563709259, "learning_rate": 2.4291638377382798e-08, "loss": 0.0474, "num_input_tokens_seen": 7243104, "step": 37995 }, { "epoch": 19.75051975051975, "grad_norm": 0.58974289894104, "learning_rate": 2.3794460453555047e-08, "loss": 0.0366, "num_input_tokens_seen": 7244000, "step": 38000 }, { "epoch": 19.753118503118504, "grad_norm": 0.5440881252288818, "learning_rate": 2.330242073039224e-08, "loss": 0.0836, "num_input_tokens_seen": 7244896, "step": 38005 }, { "epoch": 19.755717255717254, "grad_norm": 0.858291506767273, "learning_rate": 2.2815519309113407e-08, "loss": 0.17, "num_input_tokens_seen": 7245792, "step": 38010 }, { "epoch": 19.758316008316008, "grad_norm": 0.6063377857208252, "learning_rate": 2.2333756289885656e-08, "loss": 0.0571, "num_input_tokens_seen": 7246784, "step": 38015 }, { "epoch": 19.760914760914762, "grad_norm": 0.46596500277519226, "learning_rate": 2.1857131771824135e-08, "loss": 0.1419, "num_input_tokens_seen": 7247744, "step": 38020 }, { "epoch": 19.763513513513512, "grad_norm": 0.5263004302978516, "learning_rate": 2.13856458529782e-08, "loss": 0.0665, "num_input_tokens_seen": 7248672, "step": 38025 }, { "epoch": 19.766112266112266, "grad_norm": 0.11103951185941696, "learning_rate": 2.0919298630342477e-08, "loss": 0.1946, "num_input_tokens_seen": 7249664, "step": 38030 }, { "epoch": 19.76871101871102, "grad_norm": 0.5733762383460999, "learning_rate": 2.0458090199862446e-08, "loss": 0.0527, "num_input_tokens_seen": 7250624, "step": 38035 }, { "epoch": 19.77130977130977, "grad_norm": 0.8882941007614136, "learning_rate": 2.0002020656409436e-08, "loss": 0.1543, "num_input_tokens_seen": 7251520, "step": 38040 }, { "epoch": 19.773908523908524, "grad_norm": 0.7579831480979919, "learning_rate": 1.9551090093816727e-08, "loss": 0.1873, "num_input_tokens_seen": 7252512, "step": 38045 }, { "epoch": 19.776507276507278, "grad_norm": 0.3099108338356018, "learning_rate": 1.9105298604849e-08, "loss": 0.1015, "num_input_tokens_seen": 7253632, "step": 38050 }, { "epoch": 19.77910602910603, "grad_norm": 0.7306853532791138, "learning_rate": 1.8664646281213448e-08, "loss": 0.1094, "num_input_tokens_seen": 7254624, "step": 38055 }, { "epoch": 19.781704781704782, "grad_norm": 1.0324846506118774, "learning_rate": 1.8229133213565342e-08, "loss": 0.1117, "num_input_tokens_seen": 7255616, "step": 38060 }, { "epoch": 19.784303534303533, "grad_norm": 0.8691331744194031, "learning_rate": 1.779875949149967e-08, "loss": 0.2112, "num_input_tokens_seen": 7256480, "step": 38065 }, { "epoch": 19.786902286902286, "grad_norm": 0.45611801743507385, "learning_rate": 1.7373525203553953e-08, "loss": 0.3019, "num_input_tokens_seen": 7257472, "step": 38070 }, { "epoch": 19.78950103950104, "grad_norm": 0.7759065628051758, "learning_rate": 1.6953430437208206e-08, "loss": 0.1467, "num_input_tokens_seen": 7258464, "step": 38075 }, { "epoch": 19.79209979209979, "grad_norm": 0.7143684029579163, "learning_rate": 1.6538475278887743e-08, "loss": 0.0945, "num_input_tokens_seen": 7259424, "step": 38080 }, { "epoch": 19.794698544698544, "grad_norm": 0.48646101355552673, "learning_rate": 1.6128659813957613e-08, "loss": 0.1508, "num_input_tokens_seen": 7260320, "step": 38085 }, { "epoch": 19.7972972972973, "grad_norm": 0.21970853209495544, "learning_rate": 1.5723984126728153e-08, "loss": 0.108, "num_input_tokens_seen": 7261312, "step": 38090 }, { "epoch": 19.79989604989605, "grad_norm": 1.2001203298568726, "learning_rate": 1.5324448300449435e-08, "loss": 0.1875, "num_input_tokens_seen": 7262208, "step": 38095 }, { "epoch": 19.802494802494802, "grad_norm": 1.3224233388900757, "learning_rate": 1.493005241731682e-08, "loss": 0.1587, "num_input_tokens_seen": 7263264, "step": 38100 }, { "epoch": 19.805093555093556, "grad_norm": 1.4681031703948975, "learning_rate": 1.4540796558465408e-08, "loss": 0.121, "num_input_tokens_seen": 7264128, "step": 38105 }, { "epoch": 19.807692307692307, "grad_norm": 0.3998796045780182, "learning_rate": 1.4156680803972811e-08, "loss": 0.1074, "num_input_tokens_seen": 7265120, "step": 38110 }, { "epoch": 19.81029106029106, "grad_norm": 0.7427130937576294, "learning_rate": 1.3777705232864701e-08, "loss": 0.1722, "num_input_tokens_seen": 7266144, "step": 38115 }, { "epoch": 19.812889812889814, "grad_norm": 0.1815045326948166, "learning_rate": 1.3403869923103717e-08, "loss": 0.0447, "num_input_tokens_seen": 7267072, "step": 38120 }, { "epoch": 19.815488565488565, "grad_norm": 1.9532508850097656, "learning_rate": 1.3035174951595009e-08, "loss": 0.342, "num_input_tokens_seen": 7267968, "step": 38125 }, { "epoch": 19.81808731808732, "grad_norm": 0.17685401439666748, "learning_rate": 1.267162039418901e-08, "loss": 0.1225, "num_input_tokens_seen": 7268832, "step": 38130 }, { "epoch": 19.820686070686072, "grad_norm": 0.19361215829849243, "learning_rate": 1.2313206325681447e-08, "loss": 0.2885, "num_input_tokens_seen": 7269760, "step": 38135 }, { "epoch": 19.823284823284823, "grad_norm": 0.7665359973907471, "learning_rate": 1.1959932819799457e-08, "loss": 0.1288, "num_input_tokens_seen": 7270720, "step": 38140 }, { "epoch": 19.825883575883577, "grad_norm": 0.6253753304481506, "learning_rate": 1.1611799949226565e-08, "loss": 0.2469, "num_input_tokens_seen": 7271712, "step": 38145 }, { "epoch": 19.828482328482327, "grad_norm": 0.20674744248390198, "learning_rate": 1.1268807785577707e-08, "loss": 0.1151, "num_input_tokens_seen": 7272608, "step": 38150 }, { "epoch": 19.83108108108108, "grad_norm": 0.44878682494163513, "learning_rate": 1.0930956399415881e-08, "loss": 0.1232, "num_input_tokens_seen": 7273536, "step": 38155 }, { "epoch": 19.833679833679835, "grad_norm": 0.34292709827423096, "learning_rate": 1.0598245860243828e-08, "loss": 0.1073, "num_input_tokens_seen": 7274432, "step": 38160 }, { "epoch": 19.836278586278585, "grad_norm": 1.0401785373687744, "learning_rate": 1.027067623650957e-08, "loss": 0.154, "num_input_tokens_seen": 7275424, "step": 38165 }, { "epoch": 19.83887733887734, "grad_norm": 0.19862601161003113, "learning_rate": 9.948247595603643e-09, "loss": 0.1013, "num_input_tokens_seen": 7276384, "step": 38170 }, { "epoch": 19.841476091476093, "grad_norm": 0.13622762262821198, "learning_rate": 9.63096000385355e-09, "loss": 0.0745, "num_input_tokens_seen": 7277312, "step": 38175 }, { "epoch": 19.844074844074843, "grad_norm": 1.246492862701416, "learning_rate": 9.318813526534853e-09, "loss": 0.1995, "num_input_tokens_seen": 7278208, "step": 38180 }, { "epoch": 19.846673596673597, "grad_norm": 0.33456873893737793, "learning_rate": 9.011808227865625e-09, "loss": 0.0667, "num_input_tokens_seen": 7279104, "step": 38185 }, { "epoch": 19.84927234927235, "grad_norm": 0.06949407607316971, "learning_rate": 8.70994417099813e-09, "loss": 0.1553, "num_input_tokens_seen": 7280000, "step": 38190 }, { "epoch": 19.8518711018711, "grad_norm": 0.7256118059158325, "learning_rate": 8.413221418041018e-09, "loss": 0.0845, "num_input_tokens_seen": 7280928, "step": 38195 }, { "epoch": 19.854469854469855, "grad_norm": 1.441412091255188, "learning_rate": 8.121640030028798e-09, "loss": 0.2997, "num_input_tokens_seen": 7281920, "step": 38200 }, { "epoch": 19.85706860706861, "grad_norm": 0.7531623840332031, "learning_rate": 7.83520006695515e-09, "loss": 0.0814, "num_input_tokens_seen": 7282848, "step": 38205 }, { "epoch": 19.85966735966736, "grad_norm": 0.7875951528549194, "learning_rate": 7.55390158773961e-09, "loss": 0.1091, "num_input_tokens_seen": 7283840, "step": 38210 }, { "epoch": 19.862266112266113, "grad_norm": 1.8580816984176636, "learning_rate": 7.2777446502581006e-09, "loss": 0.1994, "num_input_tokens_seen": 7284736, "step": 38215 }, { "epoch": 19.864864864864863, "grad_norm": 0.5591815710067749, "learning_rate": 7.006729311320737e-09, "loss": 0.134, "num_input_tokens_seen": 7285632, "step": 38220 }, { "epoch": 19.867463617463617, "grad_norm": 0.24784652888774872, "learning_rate": 6.7408556266801425e-09, "loss": 0.0445, "num_input_tokens_seen": 7286496, "step": 38225 }, { "epoch": 19.87006237006237, "grad_norm": 1.1472890377044678, "learning_rate": 6.480123651034231e-09, "loss": 0.2176, "num_input_tokens_seen": 7287424, "step": 38230 }, { "epoch": 19.87266112266112, "grad_norm": 0.28074806928634644, "learning_rate": 6.224533438020652e-09, "loss": 0.0772, "num_input_tokens_seen": 7288384, "step": 38235 }, { "epoch": 19.875259875259875, "grad_norm": 0.5964701771736145, "learning_rate": 5.974085040222344e-09, "loss": 0.1487, "num_input_tokens_seen": 7289280, "step": 38240 }, { "epoch": 19.87785862785863, "grad_norm": 1.0149630308151245, "learning_rate": 5.728778509161981e-09, "loss": 0.0849, "num_input_tokens_seen": 7290240, "step": 38245 }, { "epoch": 19.88045738045738, "grad_norm": 0.7380611300468445, "learning_rate": 5.4886138953047505e-09, "loss": 0.0841, "num_input_tokens_seen": 7291136, "step": 38250 }, { "epoch": 19.883056133056133, "grad_norm": 0.27746015787124634, "learning_rate": 5.253591248058354e-09, "loss": 0.093, "num_input_tokens_seen": 7292128, "step": 38255 }, { "epoch": 19.885654885654887, "grad_norm": 0.43571770191192627, "learning_rate": 5.023710615773003e-09, "loss": 0.1742, "num_input_tokens_seen": 7292992, "step": 38260 }, { "epoch": 19.888253638253637, "grad_norm": 1.1170361042022705, "learning_rate": 4.798972045738647e-09, "loss": 0.1874, "num_input_tokens_seen": 7293952, "step": 38265 }, { "epoch": 19.89085239085239, "grad_norm": 0.752674400806427, "learning_rate": 4.579375584190526e-09, "loss": 0.1791, "num_input_tokens_seen": 7294912, "step": 38270 }, { "epoch": 19.893451143451145, "grad_norm": 0.5398762226104736, "learning_rate": 4.3649212763036125e-09, "loss": 0.0784, "num_input_tokens_seen": 7295808, "step": 38275 }, { "epoch": 19.896049896049895, "grad_norm": 1.33090341091156, "learning_rate": 4.1556091662009465e-09, "loss": 0.1782, "num_input_tokens_seen": 7296768, "step": 38280 }, { "epoch": 19.89864864864865, "grad_norm": 1.8213369846343994, "learning_rate": 3.951439296936976e-09, "loss": 0.1605, "num_input_tokens_seen": 7297728, "step": 38285 }, { "epoch": 19.901247401247403, "grad_norm": 0.13777442276477814, "learning_rate": 3.75241171051699e-09, "loss": 0.1042, "num_input_tokens_seen": 7298656, "step": 38290 }, { "epoch": 19.903846153846153, "grad_norm": 0.6506609320640564, "learning_rate": 3.558526447888788e-09, "loss": 0.0758, "num_input_tokens_seen": 7299520, "step": 38295 }, { "epoch": 19.906444906444907, "grad_norm": 0.4108006954193115, "learning_rate": 3.3697835489343575e-09, "loss": 0.0772, "num_input_tokens_seen": 7300448, "step": 38300 }, { "epoch": 19.909043659043657, "grad_norm": 0.5560332536697388, "learning_rate": 3.1861830524837487e-09, "loss": 0.1193, "num_input_tokens_seen": 7301376, "step": 38305 }, { "epoch": 19.91164241164241, "grad_norm": 1.1865962743759155, "learning_rate": 3.0077249963095246e-09, "loss": 0.1364, "num_input_tokens_seen": 7302400, "step": 38310 }, { "epoch": 19.914241164241165, "grad_norm": 0.2625444829463959, "learning_rate": 2.8344094171212086e-09, "loss": 0.0549, "num_input_tokens_seen": 7303296, "step": 38315 }, { "epoch": 19.916839916839916, "grad_norm": 1.3630191087722778, "learning_rate": 2.6662363505791656e-09, "loss": 0.1727, "num_input_tokens_seen": 7304256, "step": 38320 }, { "epoch": 19.91943866943867, "grad_norm": 0.17450928688049316, "learning_rate": 2.503205831277944e-09, "loss": 0.1189, "num_input_tokens_seen": 7305184, "step": 38325 }, { "epoch": 19.922037422037423, "grad_norm": 1.626731276512146, "learning_rate": 2.3453178927573816e-09, "loss": 0.1511, "num_input_tokens_seen": 7306144, "step": 38330 }, { "epoch": 19.924636174636174, "grad_norm": 0.7077884674072266, "learning_rate": 2.1925725674970533e-09, "loss": 0.1277, "num_input_tokens_seen": 7307104, "step": 38335 }, { "epoch": 19.927234927234927, "grad_norm": 0.32114067673683167, "learning_rate": 2.0449698869218214e-09, "loss": 0.0621, "num_input_tokens_seen": 7308096, "step": 38340 }, { "epoch": 19.92983367983368, "grad_norm": 0.11669863760471344, "learning_rate": 1.902509881396286e-09, "loss": 0.1874, "num_input_tokens_seen": 7309024, "step": 38345 }, { "epoch": 19.93243243243243, "grad_norm": 1.026727557182312, "learning_rate": 1.7651925802303347e-09, "loss": 0.1682, "num_input_tokens_seen": 7309952, "step": 38350 }, { "epoch": 19.935031185031185, "grad_norm": 0.9752469658851624, "learning_rate": 1.6330180116708172e-09, "loss": 0.1273, "num_input_tokens_seen": 7310912, "step": 38355 }, { "epoch": 19.93762993762994, "grad_norm": 0.2175494134426117, "learning_rate": 1.5059862029070947e-09, "loss": 0.1036, "num_input_tokens_seen": 7311840, "step": 38360 }, { "epoch": 19.94022869022869, "grad_norm": 0.21199147403240204, "learning_rate": 1.3840971800793689e-09, "loss": 0.0845, "num_input_tokens_seen": 7312800, "step": 38365 }, { "epoch": 19.942827442827443, "grad_norm": 0.531916081905365, "learning_rate": 1.2673509682564755e-09, "loss": 0.0855, "num_input_tokens_seen": 7313760, "step": 38370 }, { "epoch": 19.945426195426194, "grad_norm": 0.38776710629463196, "learning_rate": 1.1557475914580895e-09, "loss": 0.1138, "num_input_tokens_seen": 7314784, "step": 38375 }, { "epoch": 19.948024948024948, "grad_norm": 0.1875813752412796, "learning_rate": 1.0492870726463987e-09, "loss": 0.1106, "num_input_tokens_seen": 7315680, "step": 38380 }, { "epoch": 19.9506237006237, "grad_norm": 0.7443088889122009, "learning_rate": 9.479694337205524e-10, "loss": 0.1334, "num_input_tokens_seen": 7316672, "step": 38385 }, { "epoch": 19.953222453222452, "grad_norm": 0.15693733096122742, "learning_rate": 8.517946955222123e-10, "loss": 0.0361, "num_input_tokens_seen": 7317568, "step": 38390 }, { "epoch": 19.955821205821206, "grad_norm": 0.21726340055465698, "learning_rate": 7.607628778383281e-10, "loss": 0.1069, "num_input_tokens_seen": 7318400, "step": 38395 }, { "epoch": 19.95841995841996, "grad_norm": 0.16852335631847382, "learning_rate": 6.748739993983622e-10, "loss": 0.0753, "num_input_tokens_seen": 7319360, "step": 38400 }, { "epoch": 19.96101871101871, "grad_norm": 0.7118948698043823, "learning_rate": 5.941280778687386e-10, "loss": 0.1131, "num_input_tokens_seen": 7320288, "step": 38405 }, { "epoch": 19.963617463617464, "grad_norm": 0.030060047283768654, "learning_rate": 5.185251298639449e-10, "loss": 0.0505, "num_input_tokens_seen": 7321216, "step": 38410 }, { "epoch": 19.966216216216218, "grad_norm": 0.7489647269248962, "learning_rate": 4.4806517093265444e-10, "loss": 0.1433, "num_input_tokens_seen": 7322176, "step": 38415 }, { "epoch": 19.968814968814968, "grad_norm": 0.6786806583404541, "learning_rate": 3.8274821557438e-10, "loss": 0.0375, "num_input_tokens_seen": 7323136, "step": 38420 }, { "epoch": 19.97141372141372, "grad_norm": 0.5750435590744019, "learning_rate": 3.2257427722559574e-10, "loss": 0.1064, "num_input_tokens_seen": 7324064, "step": 38425 }, { "epoch": 19.974012474012476, "grad_norm": 0.3177357017993927, "learning_rate": 2.6754336826528837e-10, "loss": 0.0495, "num_input_tokens_seen": 7324928, "step": 38430 }, { "epoch": 19.976611226611226, "grad_norm": 0.7064406275749207, "learning_rate": 2.176555000121816e-10, "loss": 0.1053, "num_input_tokens_seen": 7325888, "step": 38435 }, { "epoch": 19.97920997920998, "grad_norm": 1.5298280715942383, "learning_rate": 1.7291068273306288e-10, "loss": 0.2411, "num_input_tokens_seen": 7326784, "step": 38440 }, { "epoch": 19.981808731808734, "grad_norm": 0.21005263924598694, "learning_rate": 1.3330892563168107e-10, "loss": 0.0481, "num_input_tokens_seen": 7327680, "step": 38445 }, { "epoch": 19.984407484407484, "grad_norm": 0.1396220624446869, "learning_rate": 9.885023685152206e-11, "loss": 0.0448, "num_input_tokens_seen": 7328608, "step": 38450 }, { "epoch": 19.987006237006238, "grad_norm": 0.7097146511077881, "learning_rate": 6.953462348691098e-11, "loss": 0.1303, "num_input_tokens_seen": 7329600, "step": 38455 }, { "epoch": 19.989604989604988, "grad_norm": 0.15674562752246857, "learning_rate": 4.53620915691344e-11, "loss": 0.0776, "num_input_tokens_seen": 7330496, "step": 38460 }, { "epoch": 19.992203742203742, "grad_norm": 0.5270010828971863, "learning_rate": 2.6332646063664812e-11, "loss": 0.1508, "num_input_tokens_seen": 7331456, "step": 38465 }, { "epoch": 19.994802494802496, "grad_norm": 0.0911172404885292, "learning_rate": 1.24462908951406e-11, "loss": 0.1028, "num_input_tokens_seen": 7332384, "step": 38470 }, { "epoch": 19.997401247401246, "grad_norm": 1.360542893409729, "learning_rate": 3.703028911283823e-12, "loss": 0.1778, "num_input_tokens_seen": 7333344, "step": 38475 }, { "epoch": 20.0, "grad_norm": 0.4419593811035156, "learning_rate": 1.0286191620689778e-13, "loss": 0.1629, "num_input_tokens_seen": 7334376, "step": 38480 }, { "epoch": 20.0, "eval_loss": 0.14645375311374664, "eval_runtime": 8.0178, "eval_samples_per_second": 106.763, "eval_steps_per_second": 26.691, "num_input_tokens_seen": 7334376, "step": 38480 }, { "epoch": 20.0, "num_input_tokens_seen": 7334376, "step": 38480, "total_flos": 3.302722883587277e+17, "train_loss": 0.16706526167787564, "train_runtime": 5567.35, "train_samples_per_second": 27.643, "train_steps_per_second": 6.912 } ], "logging_steps": 5, "max_steps": 38480, "num_input_tokens_seen": 7334376, "num_train_epochs": 20, "save_steps": 1924, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.302722883587277e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }